Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz | 2
-rw-r--r--  kernel/Makefile | 83
-rw-r--r--  kernel/audit.c | 158
-rw-r--r--  kernel/audit.h | 3
-rw-r--r--  kernel/auditfilter.c | 3
-rw-r--r--  kernel/auditsc.c | 133
-rw-r--r--  kernel/bounds.c | 6
-rw-r--r--  kernel/capability.c | 13
-rw-r--r--  kernel/cgroup.c | 302
-rw-r--r--  kernel/context_tracking.c | 14
-rw-r--r--  kernel/cpu.c | 49
-rw-r--r--  kernel/cpu/idle.c | 16
-rw-r--r--  kernel/debug/debug_core.c | 32
-rw-r--r--  kernel/debug/debug_core.h | 3
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/delayacct.c | 7
-rw-r--r--  kernel/elfcore.c | 10
-rw-r--r--  kernel/events/core.c | 204
-rw-r--r--  kernel/events/internal.h | 35
-rw-r--r--  kernel/events/ring_buffer.c | 126
-rw-r--r--  kernel/events/uprobes.c | 227
-rw-r--r--  kernel/extable.c | 2
-rw-r--r--  kernel/fork.c | 53
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/gcov/Kconfig | 30
-rw-r--r--  kernel/gcov/Makefile | 32
-rw-r--r--  kernel/gcov/base.c | 32
-rw-r--r--  kernel/gcov/fs.c | 54
-rw-r--r--  kernel/gcov/gcc_3_4.c | 115
-rw-r--r--  kernel/gcov/gcc_4_7.c | 560
-rw-r--r--  kernel/gcov/gcov.h | 65
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hung_task.c | 17
-rw-r--r--  kernel/irq/Kconfig | 12
-rw-r--r--  kernel/irq/chip.c | 2
-rw-r--r--  kernel/irq/irqdomain.c | 13
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/settings.h | 7
-rw-r--r--  kernel/irq/spurious.c | 12
-rw-r--r--  kernel/jump_label.c | 5
-rw-r--r--  kernel/kexec.c | 7
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 99
-rw-r--r--  kernel/ksysfs.c | 2
-rw-r--r--  kernel/kthread.c | 73
-rw-r--r--  kernel/locking/Makefile | 25
-rw-r--r--  kernel/locking/lglock.c (renamed from kernel/lglock.c) | 0
-rw-r--r--  kernel/locking/lockdep.c (renamed from kernel/lockdep.c) | 8
-rw-r--r--  kernel/locking/lockdep_internals.h (renamed from kernel/lockdep_internals.h) | 0
-rw-r--r--  kernel/locking/lockdep_proc.c (renamed from kernel/lockdep_proc.c) | 15
-rw-r--r--  kernel/locking/lockdep_states.h (renamed from kernel/lockdep_states.h) | 0
-rw-r--r--  kernel/locking/mutex-debug.c (renamed from kernel/mutex-debug.c) | 0
-rw-r--r--  kernel/locking/mutex-debug.h (renamed from kernel/mutex-debug.h) | 0
-rw-r--r--  kernel/locking/mutex.c (renamed from kernel/mutex.c) | 34
-rw-r--r--  kernel/locking/mutex.h (renamed from kernel/mutex.h) | 0
-rw-r--r--  kernel/locking/percpu-rwsem.c | 165
-rw-r--r--  kernel/locking/rtmutex-debug.c (renamed from kernel/rtmutex-debug.c) | 0
-rw-r--r--  kernel/locking/rtmutex-debug.h (renamed from kernel/rtmutex-debug.h) | 0
-rw-r--r--  kernel/locking/rtmutex-tester.c (renamed from kernel/rtmutex-tester.c) | 0
-rw-r--r--  kernel/locking/rtmutex.c (renamed from kernel/rtmutex.c) | 0
-rw-r--r--  kernel/locking/rtmutex.h (renamed from kernel/rtmutex.h) | 0
-rw-r--r--  kernel/locking/rtmutex_common.h (renamed from kernel/rtmutex_common.h) | 0
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 296
-rw-r--r--  kernel/locking/rwsem-xadd.c | 293
-rw-r--r--  kernel/locking/rwsem.c (renamed from kernel/rwsem.c) | 0
-rw-r--r--  kernel/locking/semaphore.c (renamed from kernel/semaphore.c) | 0
-rw-r--r--  kernel/locking/spinlock.c (renamed from kernel/spinlock.c) | 14
-rw-r--r--  kernel/locking/spinlock_debug.c | 302
-rw-r--r--  kernel/modsign_certificate.S | 12
-rw-r--r--  kernel/modsign_pubkey.c | 104
-rw-r--r--  kernel/module-internal.h | 2
-rw-r--r--  kernel/module.c | 169
-rw-r--r--  kernel/module_signing.c | 11
-rw-r--r--  kernel/nsproxy.c | 36
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/params.c | 14
-rw-r--r--  kernel/pid.c | 6
-rw-r--r--  kernel/pid_namespace.c | 10
-rw-r--r--  kernel/power/Kconfig | 16
-rw-r--r--  kernel/power/hibernate.c | 47
-rw-r--r--  kernel/power/qos.c | 26
-rw-r--r--  kernel/power/snapshot.c | 26
-rw-r--r--  kernel/power/user.c | 53
-rw-r--r--  kernel/printk/printk.c | 35
-rw-r--r--  kernel/ptrace.c | 5
-rw-r--r--  kernel/rcu/Makefile | 6
-rw-r--r--  kernel/rcu/rcu.h (renamed from kernel/rcu.h) | 7
-rw-r--r--  kernel/rcu/srcu.c (renamed from kernel/srcu.c) | 0
-rw-r--r--  kernel/rcu/tiny.c (renamed from kernel/rcutiny.c) | 37
-rw-r--r--  kernel/rcu/tiny_plugin.h (renamed from kernel/rcutiny_plugin.h) | 0
-rw-r--r--  kernel/rcu/torture.c (renamed from kernel/rcutorture.c) | 6
-rw-r--r--  kernel/rcu/tree.c (renamed from kernel/rcutree.c) | 200
-rw-r--r--  kernel/rcu/tree.h (renamed from kernel/rcutree.h) | 2
-rw-r--r--  kernel/rcu/tree_plugin.h (renamed from kernel/rcutree_plugin.h) | 84
-rw-r--r--  kernel/rcu/tree_trace.c (renamed from kernel/rcutree_trace.c) | 2
-rw-r--r--  kernel/rcu/update.c (renamed from kernel/rcupdate.c) | 12
-rw-r--r--  kernel/reboot.c | 9
-rw-r--r--  kernel/res_counter.c | 25
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/completion.c | 299
-rw-r--r--  kernel/sched/core.c | 697
-rw-r--r--  kernel/sched/debug.c | 74
-rw-r--r--  kernel/sched/fair.c | 1449
-rw-r--r--  kernel/sched/features.h | 19
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 22
-rw-r--r--  kernel/sched/sched.h | 54
-rw-r--r--  kernel/sched/stats.h | 51
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/sched/wait.c (renamed from kernel/wait.c) | 127
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  kernel/smp.c | 35
-rw-r--r--  kernel/softirq.c | 189
-rw-r--r--  kernel/stop_machine.c | 303
-rw-r--r--  kernel/sys.c | 21
-rw-r--r--  kernel/sysctl.c | 42
-rw-r--r--  kernel/sysctl_binary.c | 6
-rw-r--r--  kernel/system_certificates.S | 10
-rw-r--r--  kernel/system_keyring.c | 105
-rw-r--r--  kernel/task_work.c | 40
-rw-r--r--  kernel/taskstats.c | 54
-rw-r--r--  kernel/time/Kconfig | 2
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/clockevents.c | 67
-rw-r--r--  kernel/time/clocksource.c | 52
-rw-r--r--  kernel/time/ntp.c | 9
-rw-r--r--  kernel/time/sched_clock.c | 114
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/timekeeping.c | 5
-rw-r--r--  kernel/time/timer_stats.c | 8
-rw-r--r--  kernel/timer.c | 8
-rw-r--r--  kernel/trace/blktrace.c | 36
-rw-r--r--  kernel/trace/ftrace.c | 178
-rw-r--r--  kernel/trace/trace.c | 122
-rw-r--r--  kernel/trace/trace.h | 52
-rw-r--r--  kernel/trace/trace_branch.c | 2
-rw-r--r--  kernel/trace/trace_event_perf.c | 2
-rw-r--r--  kernel/trace/trace_events.c | 239
-rw-r--r--  kernel/trace/trace_events_filter.c | 218
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 82
-rw-r--r--  kernel/trace/trace_kprobe.c | 4
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 4
-rw-r--r--  kernel/trace/trace_output.c | 19
-rw-r--r--  kernel/trace/trace_sched_switch.c | 4
-rw-r--r--  kernel/trace/trace_stat.c | 41
-rw-r--r--  kernel/trace/trace_syscalls.c | 52
-rw-r--r--  kernel/trace/trace_uprobe.c | 3
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/up.c | 69
-rw-r--r--  kernel/user.c | 6
-rw-r--r--  kernel/user_namespace.c | 8
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/watchdog.c | 60
156 files changed, 6865 insertions, 2984 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 94fabd534b03..2a202a846757 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
+	def_bool HIGH_RES_TIMERS
diff --git a/kernel/Makefile b/kernel/Makefile
index 35ef1185e359..bbaf7d59c1bb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,55 +6,44 @@ obj-y = fork.o exec_domain.o panic.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 extable.o params.o posix-timers.o \
10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o sys_ni.o posix-cpu-timers.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o nsproxy.o \
12 notifier.o ksysfs.o cred.o reboot.o \ 12 notifier.o ksysfs.o cred.o reboot.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o smpboot.o
14 14
15ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
17CFLAGS_REMOVE_lockdep.o = -pg
18CFLAGS_REMOVE_lockdep_proc.o = -pg
19CFLAGS_REMOVE_mutex-debug.o = -pg
20CFLAGS_REMOVE_rtmutex-debug.o = -pg
21CFLAGS_REMOVE_cgroup-debug.o = -pg 17CFLAGS_REMOVE_cgroup-debug.o = -pg
22CFLAGS_REMOVE_irq_work.o = -pg 18CFLAGS_REMOVE_irq_work.o = -pg
23endif 19endif
24 20
25obj-y += sched/ 21obj-y += sched/
22obj-y += locking/
26obj-y += power/ 23obj-y += power/
27obj-y += printk/ 24obj-y += printk/
28obj-y += cpu/ 25obj-y += cpu/
26obj-y += irq/
27obj-y += rcu/
29 28
30obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o 29obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
31obj-$(CONFIG_FREEZER) += freezer.o 30obj-$(CONFIG_FREEZER) += freezer.o
32obj-$(CONFIG_PROFILING) += profile.o 31obj-$(CONFIG_PROFILING) += profile.o
33obj-$(CONFIG_STACKTRACE) += stacktrace.o 32obj-$(CONFIG_STACKTRACE) += stacktrace.o
34obj-y += time/ 33obj-y += time/
35obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
36obj-$(CONFIG_LOCKDEP) += lockdep.o
37ifeq ($(CONFIG_PROC_FS),y)
38obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
39endif
40obj-$(CONFIG_FUTEX) += futex.o 34obj-$(CONFIG_FUTEX) += futex.o
41ifeq ($(CONFIG_COMPAT),y) 35ifeq ($(CONFIG_COMPAT),y)
42obj-$(CONFIG_FUTEX) += futex_compat.o 36obj-$(CONFIG_FUTEX) += futex_compat.o
43endif 37endif
44obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
45obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
46obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
47obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 38obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
48obj-$(CONFIG_SMP) += smp.o 39obj-$(CONFIG_SMP) += smp.o
49ifneq ($(CONFIG_SMP),y) 40ifneq ($(CONFIG_SMP),y)
50obj-y += up.o 41obj-y += up.o
51endif 42endif
52obj-$(CONFIG_SMP) += spinlock.o
53obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
54obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
55obj-$(CONFIG_UID16) += uid16.o 43obj-$(CONFIG_UID16) += uid16.o
44obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
56obj-$(CONFIG_MODULES) += module.o 45obj-$(CONFIG_MODULES) += module.o
57obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o 46obj-$(CONFIG_MODULE_SIG) += module_signing.o
58obj-$(CONFIG_KALLSYMS) += kallsyms.o 47obj-$(CONFIG_KALLSYMS) += kallsyms.o
59obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 48obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
60obj-$(CONFIG_KEXEC) += kexec.o 49obj-$(CONFIG_KEXEC) += kexec.o
@@ -79,14 +68,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
79obj-$(CONFIG_KGDB) += debug/ 68obj-$(CONFIG_KGDB) += debug/
80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 69obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
81obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 70obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
82obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
83obj-$(CONFIG_SECCOMP) += seccomp.o 71obj-$(CONFIG_SECCOMP) += seccomp.o
84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
85obj-$(CONFIG_TREE_RCU) += rcutree.o
86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
88obj-$(CONFIG_TINY_RCU) += rcutiny.o
89obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
90obj-$(CONFIG_RELAY) += relay.o 72obj-$(CONFIG_RELAY) += relay.o
91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 73obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 74obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -141,19 +123,52 @@ targets += timeconst.h
141$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE 123$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
142 $(call if_changed,bc) 124 $(call if_changed,bc)
143 125
144ifeq ($(CONFIG_MODULE_SIG),y) 126###############################################################################
127#
128# Roll all the X.509 certificates that we can find together and pull them into
129# the kernel so that they get loaded into the system trusted keyring during
130# boot.
145# 131#
146# Pull the signing certificate and any extra certificates into the kernel 132# We look in the source root and the build root for all files whose name ends
133# in ".x509". Unfortunately, this will generate duplicate filenames, so we
134# have make canonicalise the pathnames and then sort them to discard the
135# duplicates.
147# 136#
137###############################################################################
138ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
139X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
140X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509
141X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
142 $(or $(realpath $(CERT)),$(CERT))))
143
144ifeq ($(X509_CERTIFICATES),)
145$(warning *** No X.509 certificates found ***)
146endif
148 147
149quiet_cmd_touch = TOUCH $@ 148ifneq ($(wildcard $(obj)/.x509.list),)
150 cmd_touch = touch $@ 149ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
150$(info X.509 certificate list changed)
151$(shell rm $(obj)/.x509.list)
152endif
153endif
154
155kernel/system_certificates.o: $(obj)/x509_certificate_list
156
157quiet_cmd_x509certs = CERTS $@
158 cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
151 159
152extra_certificates: 160targets += $(obj)/x509_certificate_list
153 $(call cmd,touch) 161$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
162 $(call if_changed,x509certs)
154 163
155kernel/modsign_certificate.o: signing_key.x509 extra_certificates 164targets += $(obj)/.x509.list
165$(obj)/.x509.list:
166 @echo $(X509_CERTIFICATES) >$@
156 167
168clean-files := x509_certificate_list .x509.list
169endif
170
171ifeq ($(CONFIG_MODULE_SIG),y)
157############################################################################### 172###############################################################################
158# 173#
159# If module signing is requested, say by allyesconfig, but a key has not been 174# If module signing is requested, say by allyesconfig, but a key has not been
diff --git a/kernel/audit.c b/kernel/audit.c
index 91e53d04b6a9..906ae5a0233a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -60,7 +60,6 @@
60#ifdef CONFIG_SECURITY 60#ifdef CONFIG_SECURITY
61#include <linux/security.h> 61#include <linux/security.h>
62#endif 62#endif
63#include <net/netlink.h>
64#include <linux/freezer.h> 63#include <linux/freezer.h>
65#include <linux/tty.h> 64#include <linux/tty.h>
66#include <linux/pid_namespace.h> 65#include <linux/pid_namespace.h>
@@ -140,6 +139,17 @@ static struct task_struct *kauditd_task;
140static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); 139static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
141static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 140static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
142 141
142static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
143 .mask = -1,
144 .features = 0,
145 .lock = 0,};
146
147static char *audit_feature_names[2] = {
148 "only_unset_loginuid",
149 "loginuid_immutable",
150};
151
152
143/* Serialize requests from userspace. */ 153/* Serialize requests from userspace. */
144DEFINE_MUTEX(audit_cmd_mutex); 154DEFINE_MUTEX(audit_cmd_mutex);
145 155
@@ -584,6 +594,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
584 return -EOPNOTSUPP; 594 return -EOPNOTSUPP;
585 case AUDIT_GET: 595 case AUDIT_GET:
586 case AUDIT_SET: 596 case AUDIT_SET:
597 case AUDIT_GET_FEATURE:
598 case AUDIT_SET_FEATURE:
587 case AUDIT_LIST_RULES: 599 case AUDIT_LIST_RULES:
588 case AUDIT_ADD_RULE: 600 case AUDIT_ADD_RULE:
589 case AUDIT_DEL_RULE: 601 case AUDIT_DEL_RULE:
@@ -613,7 +625,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
613 int rc = 0; 625 int rc = 0;
614 uid_t uid = from_kuid(&init_user_ns, current_uid()); 626 uid_t uid = from_kuid(&init_user_ns, current_uid());
615 627
616 if (!audit_enabled) { 628 if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
617 *ab = NULL; 629 *ab = NULL;
618 return rc; 630 return rc;
619 } 631 }
@@ -628,6 +640,94 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
628 return rc; 640 return rc;
629} 641}
630 642
643int is_audit_feature_set(int i)
644{
645 return af.features & AUDIT_FEATURE_TO_MASK(i);
646}
647
648
649static int audit_get_feature(struct sk_buff *skb)
650{
651 u32 seq;
652
653 seq = nlmsg_hdr(skb)->nlmsg_seq;
654
655 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
656 &af, sizeof(af));
657
658 return 0;
659}
660
661static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
662 u32 old_lock, u32 new_lock, int res)
663{
664 struct audit_buffer *ab;
665
666 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
667 audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d",
668 audit_feature_names[which], !!old_feature, !!new_feature,
669 !!old_lock, !!new_lock, res);
670 audit_log_end(ab);
671}
672
673static int audit_set_feature(struct sk_buff *skb)
674{
675 struct audit_features *uaf;
676 int i;
677
678 BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0]));
679 uaf = nlmsg_data(nlmsg_hdr(skb));
680
681 /* if there is ever a version 2 we should handle that here */
682
683 for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
684 u32 feature = AUDIT_FEATURE_TO_MASK(i);
685 u32 old_feature, new_feature, old_lock, new_lock;
686
687 /* if we are not changing this feature, move along */
688 if (!(feature & uaf->mask))
689 continue;
690
691 old_feature = af.features & feature;
692 new_feature = uaf->features & feature;
693 new_lock = (uaf->lock | af.lock) & feature;
694 old_lock = af.lock & feature;
695
696 /* are we changing a locked feature? */
697 if ((af.lock & feature) && (new_feature != old_feature)) {
698 audit_log_feature_change(i, old_feature, new_feature,
699 old_lock, new_lock, 0);
700 return -EPERM;
701 }
702 }
703 /* nothing invalid, do the changes */
704 for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
705 u32 feature = AUDIT_FEATURE_TO_MASK(i);
706 u32 old_feature, new_feature, old_lock, new_lock;
707
708 /* if we are not changing this feature, move along */
709 if (!(feature & uaf->mask))
710 continue;
711
712 old_feature = af.features & feature;
713 new_feature = uaf->features & feature;
714 old_lock = af.lock & feature;
715 new_lock = (uaf->lock | af.lock) & feature;
716
717 if (new_feature != old_feature)
718 audit_log_feature_change(i, old_feature, new_feature,
719 old_lock, new_lock, 1);
720
721 if (new_feature)
722 af.features |= feature;
723 else
724 af.features &= ~feature;
725 af.lock |= new_lock;
726 }
727
728 return 0;
729}
730
631static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 731static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
632{ 732{
633 u32 seq; 733 u32 seq;
@@ -659,6 +759,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
659 759
660 switch (msg_type) { 760 switch (msg_type) {
661 case AUDIT_GET: 761 case AUDIT_GET:
762 memset(&status_set, 0, sizeof(status_set));
662 status_set.enabled = audit_enabled; 763 status_set.enabled = audit_enabled;
663 status_set.failure = audit_failure; 764 status_set.failure = audit_failure;
664 status_set.pid = audit_pid; 765 status_set.pid = audit_pid;
@@ -670,7 +771,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
670 &status_set, sizeof(status_set)); 771 &status_set, sizeof(status_set));
671 break; 772 break;
672 case AUDIT_SET: 773 case AUDIT_SET:
673 if (nlh->nlmsg_len < sizeof(struct audit_status)) 774 if (nlmsg_len(nlh) < sizeof(struct audit_status))
674 return -EINVAL; 775 return -EINVAL;
675 status_get = (struct audit_status *)data; 776 status_get = (struct audit_status *)data;
676 if (status_get->mask & AUDIT_STATUS_ENABLED) { 777 if (status_get->mask & AUDIT_STATUS_ENABLED) {
@@ -699,6 +800,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
699 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 800 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
700 err = audit_set_backlog_limit(status_get->backlog_limit); 801 err = audit_set_backlog_limit(status_get->backlog_limit);
701 break; 802 break;
803 case AUDIT_GET_FEATURE:
804 err = audit_get_feature(skb);
805 if (err)
806 return err;
807 break;
808 case AUDIT_SET_FEATURE:
809 err = audit_set_feature(skb);
810 if (err)
811 return err;
812 break;
702 case AUDIT_USER: 813 case AUDIT_USER:
703 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 814 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
704 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: 815 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
@@ -715,7 +826,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
715 } 826 }
716 audit_log_common_recv_msg(&ab, msg_type); 827 audit_log_common_recv_msg(&ab, msg_type);
717 if (msg_type != AUDIT_USER_TTY) 828 if (msg_type != AUDIT_USER_TTY)
718 audit_log_format(ab, " msg='%.1024s'", 829 audit_log_format(ab, " msg='%.*s'",
830 AUDIT_MESSAGE_TEXT_MAX,
719 (char *)data); 831 (char *)data);
720 else { 832 else {
721 int size; 833 int size;
@@ -818,7 +930,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
818 struct task_struct *tsk = current; 930 struct task_struct *tsk = current;
819 931
820 spin_lock(&tsk->sighand->siglock); 932 spin_lock(&tsk->sighand->siglock);
821 s.enabled = tsk->signal->audit_tty != 0; 933 s.enabled = tsk->signal->audit_tty;
822 s.log_passwd = tsk->signal->audit_tty_log_passwd; 934 s.log_passwd = tsk->signal->audit_tty_log_passwd;
823 spin_unlock(&tsk->sighand->siglock); 935 spin_unlock(&tsk->sighand->siglock);
824 936
@@ -832,7 +944,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
832 944
833 memset(&s, 0, sizeof(s)); 945 memset(&s, 0, sizeof(s));
834 /* guard against past and future API changes */ 946 /* guard against past and future API changes */
835 memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len)); 947 memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
836 if ((s.enabled != 0 && s.enabled != 1) || 948 if ((s.enabled != 0 && s.enabled != 1) ||
837 (s.log_passwd != 0 && s.log_passwd != 1)) 949 (s.log_passwd != 0 && s.log_passwd != 1))
838 return -EINVAL; 950 return -EINVAL;
@@ -1067,13 +1179,6 @@ static void wait_for_auditd(unsigned long sleep_time)
1067 remove_wait_queue(&audit_backlog_wait, &wait); 1179 remove_wait_queue(&audit_backlog_wait, &wait);
1068} 1180}
1069 1181
1070/* Obtain an audit buffer. This routine does locking to obtain the
1071 * audit buffer, but then no locking is required for calls to
1072 * audit_log_*format. If the tsk is a task that is currently in a
1073 * syscall, then the syscall is marked as auditable and an audit record
1074 * will be written at syscall exit. If there is no associated task, tsk
1075 * should be NULL. */
1076
1077/** 1182/**
1078 * audit_log_start - obtain an audit buffer 1183 * audit_log_start - obtain an audit buffer
1079 * @ctx: audit_context (may be NULL) 1184 * @ctx: audit_context (may be NULL)
@@ -1117,9 +1222,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1117 1222
1118 sleep_time = timeout_start + audit_backlog_wait_time - 1223 sleep_time = timeout_start + audit_backlog_wait_time -
1119 jiffies; 1224 jiffies;
1120 if ((long)sleep_time > 0) 1225 if ((long)sleep_time > 0) {
1121 wait_for_auditd(sleep_time); 1226 wait_for_auditd(sleep_time);
1122 continue; 1227 continue;
1228 }
1123 } 1229 }
1124 if (audit_rate_check() && printk_ratelimit()) 1230 if (audit_rate_check() && printk_ratelimit())
1125 printk(KERN_WARNING 1231 printk(KERN_WARNING
@@ -1388,7 +1494,7 @@ void audit_log_session_info(struct audit_buffer *ab)
1388 u32 sessionid = audit_get_sessionid(current); 1494 u32 sessionid = audit_get_sessionid(current);
1389 uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); 1495 uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
1390 1496
1391 audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid); 1497 audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
1392} 1498}
1393 1499
1394void audit_log_key(struct audit_buffer *ab, char *key) 1500void audit_log_key(struct audit_buffer *ab, char *key)
@@ -1535,6 +1641,26 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
1535 } 1641 }
1536 } 1642 }
1537 1643
1644 /* log the audit_names record type */
1645 audit_log_format(ab, " nametype=");
1646 switch(n->type) {
1647 case AUDIT_TYPE_NORMAL:
1648 audit_log_format(ab, "NORMAL");
1649 break;
1650 case AUDIT_TYPE_PARENT:
1651 audit_log_format(ab, "PARENT");
1652 break;
1653 case AUDIT_TYPE_CHILD_DELETE:
1654 audit_log_format(ab, "DELETE");
1655 break;
1656 case AUDIT_TYPE_CHILD_CREATE:
1657 audit_log_format(ab, "CREATE");
1658 break;
1659 default:
1660 audit_log_format(ab, "UNKNOWN");
1661 break;
1662 }
1663
1538 audit_log_fcaps(ab, n); 1664 audit_log_fcaps(ab, n);
1539 audit_log_end(ab); 1665 audit_log_end(ab);
1540} 1666}
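The audit.c changes above add a global struct audit_features (version, mask, features, lock) and enforce one rule in audit_set_feature(): a feature whose lock bit is already set may never be flipped again, and locks themselves are sticky. The standalone sketch below models just that rule, assuming AUDIT_FEATURE_TO_MASK(i) expands to a single bit per feature index; it is an illustration of the semantics, not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the layout used by audit_set_feature() in the hunk above. */
struct audit_features {
	uint32_t vers;
	uint32_t mask;     /* which feature bits the request wants to touch */
	uint32_t features; /* requested on/off values for those bits */
	uint32_t lock;     /* bits that may never change again */
};

#define FEATURE_BIT(i) (1u << (i))

/* Apply a request against the current state; -1 models -EPERM. */
static int set_features(struct audit_features *cur,
			const struct audit_features *req, int nfeatures)
{
	for (int i = 0; i < nfeatures; i++) {
		uint32_t bit = FEATURE_BIT(i);

		if (!(req->mask & bit))
			continue;                 /* not part of this request */
		if ((cur->lock & bit) &&
		    ((cur->features ^ req->features) & bit))
			return -1;                /* locked feature, value differs */
	}
	for (int i = 0; i < nfeatures; i++) {
		uint32_t bit = FEATURE_BIT(i);

		if (!(req->mask & bit))
			continue;
		if (req->features & bit)
			cur->features |= bit;
		else
			cur->features &= ~bit;
		cur->lock |= (req->lock & bit);   /* locks are sticky */
	}
	return 0;
}

int main(void)
{
	struct audit_features af = { .vers = 1 };
	struct audit_features req = { .mask = 1, .features = 1, .lock = 1 };

	printf("first set: %d\n", set_features(&af, &req, 2));  /* 0: allowed and now locked */
	req.features = 0;                                       /* try to clear the same bit */
	printf("locked set: %d\n", set_features(&af, &req, 2)); /* -1: refused */
	return 0;
}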
diff --git a/kernel/audit.h b/kernel/audit.h
index 123c9b7c3979..b779642b29af 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -197,6 +197,9 @@ struct audit_context {
 			int fd;
 			int flags;
 		} mmap;
+		struct {
+			int argc;
+		} execve;
 	};
 	int fds[2];
 
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f7aee8be7fb2..51f3fd4c1ed3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -343,6 +343,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 	case AUDIT_DEVMINOR:
 	case AUDIT_EXIT:
 	case AUDIT_SUCCESS:
+	case AUDIT_INODE:
 		/* bit ops are only useful on syscall args */
 		if (f->op == Audit_bitmask || f->op == Audit_bittest)
 			return -EINVAL;
@@ -423,7 +424,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		f->lsm_rule = NULL;
 
 		/* Support legacy tests for a valid loginuid */
-		if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {
+		if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
 			f->type = AUDIT_LOGINUID_SET;
 			f->val = 0;
 		}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9845cb32b60a..90594c9f7552 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -95,13 +95,6 @@ struct audit_aux_data {
95/* Number of target pids per aux struct. */ 95/* Number of target pids per aux struct. */
96#define AUDIT_AUX_PIDS 16 96#define AUDIT_AUX_PIDS 16
97 97
98struct audit_aux_data_execve {
99 struct audit_aux_data d;
100 int argc;
101 int envc;
102 struct mm_struct *mm;
103};
104
105struct audit_aux_data_pids { 98struct audit_aux_data_pids {
106 struct audit_aux_data d; 99 struct audit_aux_data d;
107 pid_t target_pid[AUDIT_AUX_PIDS]; 100 pid_t target_pid[AUDIT_AUX_PIDS];
@@ -121,12 +114,6 @@ struct audit_aux_data_bprm_fcaps {
121 struct audit_cap_data new_pcap; 114 struct audit_cap_data new_pcap;
122}; 115};
123 116
124struct audit_aux_data_capset {
125 struct audit_aux_data d;
126 pid_t pid;
127 struct audit_cap_data cap;
128};
129
130struct audit_tree_refs { 117struct audit_tree_refs {
131 struct audit_tree_refs *next; 118 struct audit_tree_refs *next;
132 struct audit_chunk *c[31]; 119 struct audit_chunk *c[31];
@@ -566,7 +553,7 @@ static int audit_filter_rules(struct task_struct *tsk,
566 break; 553 break;
567 case AUDIT_INODE: 554 case AUDIT_INODE:
568 if (name) 555 if (name)
569 result = (name->ino == f->val); 556 result = audit_comparator(name->ino, f->op, f->val);
570 else if (ctx) { 557 else if (ctx) {
571 list_for_each_entry(n, &ctx->names_list, list) { 558 list_for_each_entry(n, &ctx->names_list, list) {
572 if (audit_comparator(n->ino, f->op, f->val)) { 559 if (audit_comparator(n->ino, f->op, f->val)) {
@@ -943,8 +930,10 @@ int audit_alloc(struct task_struct *tsk)
943 return 0; /* Return if not auditing. */ 930 return 0; /* Return if not auditing. */
944 931
945 state = audit_filter_task(tsk, &key); 932 state = audit_filter_task(tsk, &key);
946 if (state == AUDIT_DISABLED) 933 if (state == AUDIT_DISABLED) {
934 clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
947 return 0; 935 return 0;
936 }
948 937
949 if (!(context = audit_alloc_context(state))) { 938 if (!(context = audit_alloc_context(state))) {
950 kfree(key); 939 kfree(key);
@@ -1149,20 +1138,16 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1149} 1138}
1150 1139
1151static void audit_log_execve_info(struct audit_context *context, 1140static void audit_log_execve_info(struct audit_context *context,
1152 struct audit_buffer **ab, 1141 struct audit_buffer **ab)
1153 struct audit_aux_data_execve *axi)
1154{ 1142{
1155 int i, len; 1143 int i, len;
1156 size_t len_sent = 0; 1144 size_t len_sent = 0;
1157 const char __user *p; 1145 const char __user *p;
1158 char *buf; 1146 char *buf;
1159 1147
1160 if (axi->mm != current->mm) 1148 p = (const char __user *)current->mm->arg_start;
1161 return; /* execve failed, no additional info */
1162
1163 p = (const char __user *)axi->mm->arg_start;
1164 1149
1165 audit_log_format(*ab, "argc=%d", axi->argc); 1150 audit_log_format(*ab, "argc=%d", context->execve.argc);
1166 1151
1167 /* 1152 /*
1168 * we need some kernel buffer to hold the userspace args. Just 1153 * we need some kernel buffer to hold the userspace args. Just
@@ -1176,7 +1161,7 @@ static void audit_log_execve_info(struct audit_context *context,
1176 return; 1161 return;
1177 } 1162 }
1178 1163
1179 for (i = 0; i < axi->argc; i++) { 1164 for (i = 0; i < context->execve.argc; i++) {
1180 len = audit_log_single_execve_arg(context, ab, i, 1165 len = audit_log_single_execve_arg(context, ab, i,
1181 &len_sent, p, buf); 1166 &len_sent, p, buf);
1182 if (len <= 0) 1167 if (len <= 0)
@@ -1279,6 +1264,9 @@ static void show_special(struct audit_context *context, int *call_panic)
1279 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, 1264 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
1280 context->mmap.flags); 1265 context->mmap.flags);
1281 break; } 1266 break; }
1267 case AUDIT_EXECVE: {
1268 audit_log_execve_info(context, &ab);
1269 break; }
1282 } 1270 }
1283 audit_log_end(ab); 1271 audit_log_end(ab);
1284} 1272}
@@ -1325,11 +1313,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1325 1313
1326 switch (aux->type) { 1314 switch (aux->type) {
1327 1315
1328 case AUDIT_EXECVE: {
1329 struct audit_aux_data_execve *axi = (void *)aux;
1330 audit_log_execve_info(context, &ab, axi);
1331 break; }
1332
1333 case AUDIT_BPRM_FCAPS: { 1316 case AUDIT_BPRM_FCAPS: {
1334 struct audit_aux_data_bprm_fcaps *axs = (void *)aux; 1317 struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
1335 audit_log_format(ab, "fver=%x", axs->fcap_ver); 1318 audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1964,6 +1947,43 @@ int auditsc_get_stamp(struct audit_context *ctx,
1964/* global counter which is incremented every time something logs in */ 1947/* global counter which is incremented every time something logs in */
1965static atomic_t session_id = ATOMIC_INIT(0); 1948static atomic_t session_id = ATOMIC_INIT(0);
1966 1949
1950static int audit_set_loginuid_perm(kuid_t loginuid)
1951{
1952 /* if we are unset, we don't need privs */
1953 if (!audit_loginuid_set(current))
1954 return 0;
1955 /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
1956 if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
1957 return -EPERM;
1958 /* it is set, you need permission */
1959 if (!capable(CAP_AUDIT_CONTROL))
1960 return -EPERM;
1961 /* reject if this is not an unset and we don't allow that */
1962 if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
1963 return -EPERM;
1964 return 0;
1965}
1966
1967static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
1968 unsigned int oldsessionid, unsigned int sessionid,
1969 int rc)
1970{
1971 struct audit_buffer *ab;
1972 uid_t uid, ologinuid, nloginuid;
1973
1974 uid = from_kuid(&init_user_ns, task_uid(current));
1975 ologinuid = from_kuid(&init_user_ns, koldloginuid);
1976 nloginuid = from_kuid(&init_user_ns, kloginuid),
1977
1978 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1979 if (!ab)
1980 return;
1981 audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old "
1982 "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid,
1983 nloginuid, oldsessionid, sessionid, !rc);
1984 audit_log_end(ab);
1985}
1986
1967/** 1987/**
1968 * audit_set_loginuid - set current task's audit_context loginuid 1988 * audit_set_loginuid - set current task's audit_context loginuid
1969 * @loginuid: loginuid value 1989 * @loginuid: loginuid value
@@ -1975,37 +1995,26 @@ static atomic_t session_id = ATOMIC_INIT(0);
1975int audit_set_loginuid(kuid_t loginuid) 1995int audit_set_loginuid(kuid_t loginuid)
1976{ 1996{
1977 struct task_struct *task = current; 1997 struct task_struct *task = current;
1978 struct audit_context *context = task->audit_context; 1998 unsigned int oldsessionid, sessionid = (unsigned int)-1;
1979 unsigned int sessionid; 1999 kuid_t oldloginuid;
2000 int rc;
1980 2001
1981#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE 2002 oldloginuid = audit_get_loginuid(current);
1982 if (audit_loginuid_set(task)) 2003 oldsessionid = audit_get_sessionid(current);
1983 return -EPERM;
1984#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
1985 if (!capable(CAP_AUDIT_CONTROL))
1986 return -EPERM;
1987#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
1988 2004
1989 sessionid = atomic_inc_return(&session_id); 2005 rc = audit_set_loginuid_perm(loginuid);
1990 if (context && context->in_syscall) { 2006 if (rc)
1991 struct audit_buffer *ab; 2007 goto out;
2008
2009 /* are we setting or clearing? */
2010 if (uid_valid(loginuid))
2011 sessionid = atomic_inc_return(&session_id);
1992 2012
1993 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1994 if (ab) {
1995 audit_log_format(ab, "login pid=%d uid=%u "
1996 "old auid=%u new auid=%u"
1997 " old ses=%u new ses=%u",
1998 task->pid,
1999 from_kuid(&init_user_ns, task_uid(task)),
2000 from_kuid(&init_user_ns, task->loginuid),
2001 from_kuid(&init_user_ns, loginuid),
2002 task->sessionid, sessionid);
2003 audit_log_end(ab);
2004 }
2005 }
2006 task->sessionid = sessionid; 2013 task->sessionid = sessionid;
2007 task->loginuid = loginuid; 2014 task->loginuid = loginuid;
2008 return 0; 2015out:
2016 audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
2017 return rc;
2009} 2018}
2010 2019
2011/** 2020/**
@@ -2126,22 +2135,12 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
2126 context->ipc.has_perm = 1; 2135 context->ipc.has_perm = 1;
2127} 2136}
2128 2137
2129int __audit_bprm(struct linux_binprm *bprm) 2138void __audit_bprm(struct linux_binprm *bprm)
2130{ 2139{
2131 struct audit_aux_data_execve *ax;
2132 struct audit_context *context = current->audit_context; 2140 struct audit_context *context = current->audit_context;
2133 2141
2134 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2142 context->type = AUDIT_EXECVE;
2135 if (!ax) 2143 context->execve.argc = bprm->argc;
2136 return -ENOMEM;
2137
2138 ax->argc = bprm->argc;
2139 ax->envc = bprm->envc;
2140 ax->mm = bprm->mm;
2141 ax->d.type = AUDIT_EXECVE;
2142 ax->d.next = context->aux;
2143 context->aux = (void *)ax;
2144 return 0;
2145} 2144}
2146 2145
2147 2146
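The auditsc.c rewrite above factors the loginuid policy into audit_set_loginuid_perm() and always emits an AUDIT_LOGIN record with the old and new values plus the result. As a quick restatement of that permission ladder, here is a hedged standalone sketch; the boolean parameters stand in for audit_loginuid_set(), the two feature flags and capable(CAP_AUDIT_CONTROL), and are illustrative mocks rather than kernel APIs.

#include <stdbool.h>
#include <stdio.h>

#define EPERM 1

/*
 * Same decision order as audit_set_loginuid_perm() in the hunk above:
 * 1. an unset loginuid may always be set,
 * 2. the "loginuid_immutable" feature forbids any later change,
 * 3. otherwise CAP_AUDIT_CONTROL is required,
 * 4. the "only_unset_loginuid" feature additionally forbids setting a valid uid.
 */
static int loginuid_perm(bool already_set, bool immutable,
			 bool only_unset, bool has_cap, bool new_uid_valid)
{
	if (!already_set)
		return 0;
	if (immutable)
		return -EPERM;
	if (!has_cap)
		return -EPERM;
	if (only_unset && new_uid_valid)
		return -EPERM;
	return 0;
}

int main(void)
{
	/* login daemon setting the loginuid for the first time */
	printf("%d\n", loginuid_perm(false, false, false, false, true)); /* 0 */
	/* privileged re-set while only_unset_loginuid is enabled */
	printf("%d\n", loginuid_perm(true, false, true, true, true));    /* -1 */
	return 0;
}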
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..5253204afdca 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,8 @@
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
 #include <linux/page_cgroup.h>
+#include <linux/log2.h>
+#include <linux/spinlock_types.h>
 
 void foo(void)
 {
@@ -17,5 +19,9 @@ void foo(void)
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
 	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+#ifdef CONFIG_SMP
+	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
+	DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int));
 	/* End of constants */
 }
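The new bounds.c entries are emitted into the generated constants header by the kbuild DEFINE() macro; NR_CPUS_BITS is ilog2(CONFIG_NR_CPUS), i.e. the base-2 logarithm rounded down, which for the usual power-of-two CONFIG_NR_CPUS values is the bit width of a CPU index. A small standalone check of that arithmetic, using a loop-based stand-in for the kernel's ilog2():

#include <stdio.h>

/* Stand-in for the kernel's ilog2(): floor(log2(n)) for n > 0. */
static unsigned int ilog2_approx(unsigned long n)
{
	unsigned int bits = 0;

	while (n >>= 1)
		bits++;
	return bits;
}

int main(void)
{
	/* With CONFIG_NR_CPUS=256, NR_CPUS_BITS becomes 8: CPU numbers
	 * 0..255 fit in an 8-bit field. */
	printf("NR_CPUS_BITS for 256 CPUs: %u\n", ilog2_approx(256));
	printf("NR_CPUS_BITS for 64 CPUs:  %u\n", ilog2_approx(64));
	return 0;
}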
diff --git a/kernel/capability.c b/kernel/capability.c
index f6c2ce5701e1..4e66bf9275b0 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -433,18 +433,6 @@ bool capable(int cap)
 EXPORT_SYMBOL(capable);
 
 /**
- * nsown_capable - Check superior capability to one's own user_ns
- * @cap: The capability in question
- *
- * Return true if the current task has the given superior capability
- * targeted at its own user namespace.
- */
-bool nsown_capable(int cap)
-{
-	return ns_capable(current_user_ns(), cap);
-}
-
-/**
  * inode_capable - Check superior capability over inode
  * @inode: The inode in question
  * @cap: The capability in question
@@ -464,3 +452,4 @@ bool inode_capable(const struct inode *inode, int cap)
 
 	return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
 }
+EXPORT_SYMBOL(inode_capable);
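With nsown_capable() removed, its one-line body is also the migration recipe: former callers now pass current_user_ns() to ns_capable() directly. The snippet below shows that shape with toy stand-ins for the kernel types so it compiles on its own; the policy inside the mock ns_capable() is invented for the example.

#include <stdbool.h>
#include <stdio.h>

#define CAP_SYS_ADMIN 21

/* Toy stand-ins for the kernel's user namespace handling. */
struct user_namespace { int level; };

static struct user_namespace init_user_ns = { 0 };

static struct user_namespace *current_user_ns(void)
{
	return &init_user_ns;            /* pretend we run in the initial ns */
}

static bool ns_capable(struct user_namespace *ns, int cap)
{
	(void)ns;
	return cap == CAP_SYS_ADMIN;     /* toy policy, not the real check */
}

/* What the removed helper did, now open-coded at every former call site. */
static bool nsown_capable_equivalent(int cap)
{
	return ns_capable(current_user_ns(), cap);
}

int main(void)
{
	printf("%d\n", nsown_capable_equivalent(CAP_SYS_ADMIN));
	return 0;
}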
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e0aeb32415ff..4c62513fe19f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
60#include <linux/poll.h> 60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 61#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 62#include <linux/kthread.h>
63#include <linux/file.h>
63 64
64#include <linux/atomic.h> 65#include <linux/atomic.h>
65 66
@@ -124,38 +125,6 @@ struct cfent {
124}; 125};
125 126
126/* 127/*
127 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
128 * cgroup_subsys->use_id != 0.
129 */
130#define CSS_ID_MAX (65535)
131struct css_id {
132 /*
133 * The css to which this ID points. This pointer is set to valid value
134 * after cgroup is populated. If cgroup is removed, this will be NULL.
135 * This pointer is expected to be RCU-safe because destroy()
136 * is called after synchronize_rcu(). But for safe use, css_tryget()
137 * should be used for avoiding race.
138 */
139 struct cgroup_subsys_state __rcu *css;
140 /*
141 * ID of this css.
142 */
143 unsigned short id;
144 /*
145 * Depth in hierarchy which this ID belongs to.
146 */
147 unsigned short depth;
148 /*
149 * ID is freed by RCU. (and lookup routine is RCU safe.)
150 */
151 struct rcu_head rcu_head;
152 /*
153 * Hierarchy of CSS ID belongs to.
154 */
155 unsigned short stack[0]; /* Array of Length (depth+1) */
156};
157
158/*
159 * cgroup_event represents events which userspace want to receive. 128 * cgroup_event represents events which userspace want to receive.
160 */ 129 */
161struct cgroup_event { 130struct cgroup_event {
@@ -386,9 +355,6 @@ struct cgrp_cset_link {
386static struct css_set init_css_set; 355static struct css_set init_css_set;
387static struct cgrp_cset_link init_cgrp_cset_link; 356static struct cgrp_cset_link init_cgrp_cset_link;
388 357
389static int cgroup_init_idr(struct cgroup_subsys *ss,
390 struct cgroup_subsys_state *css);
391
392/* 358/*
393 * css_set_lock protects the list of css_set objects, and the chain of 359 * css_set_lock protects the list of css_set objects, and the chain of
394 * tasks off each css_set. Nests outside task->alloc_lock due to 360 * tasks off each css_set. Nests outside task->alloc_lock due to
@@ -840,8 +806,6 @@ static struct backing_dev_info cgroup_backing_dev_info = {
840 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 806 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
841}; 807};
842 808
843static int alloc_css_id(struct cgroup_subsys_state *child_css);
844
845static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 809static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
846{ 810{
847 struct inode *inode = new_inode(sb); 811 struct inode *inode = new_inode(sb);
@@ -931,11 +895,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
931 iput(inode); 895 iput(inode);
932} 896}
933 897
934static int cgroup_delete(const struct dentry *d)
935{
936 return 1;
937}
938
939static void remove_dir(struct dentry *d) 898static void remove_dir(struct dentry *d)
940{ 899{
941 struct dentry *parent = dget(d->d_parent); 900 struct dentry *parent = dget(d->d_parent);
@@ -1522,7 +1481,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
1522{ 1481{
1523 static const struct dentry_operations cgroup_dops = { 1482 static const struct dentry_operations cgroup_dops = {
1524 .d_iput = cgroup_diput, 1483 .d_iput = cgroup_diput,
1525 .d_delete = cgroup_delete, 1484 .d_delete = always_delete_dentry,
1526 }; 1485 };
1527 1486
1528 struct inode *inode = 1487 struct inode *inode =
@@ -2038,7 +1997,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2038 1997
2039 /* @tsk either already exited or can't exit until the end */ 1998 /* @tsk either already exited or can't exit until the end */
2040 if (tsk->flags & PF_EXITING) 1999 if (tsk->flags & PF_EXITING)
2041 continue; 2000 goto next;
2042 2001
2043 /* as per above, nr_threads may decrease, but not increase. */ 2002 /* as per above, nr_threads may decrease, but not increase. */
2044 BUG_ON(i >= group_size); 2003 BUG_ON(i >= group_size);
@@ -2046,7 +2005,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2046 ent.cgrp = task_cgroup_from_root(tsk, root); 2005 ent.cgrp = task_cgroup_from_root(tsk, root);
2047 /* nothing to do if this task is already in the cgroup */ 2006 /* nothing to do if this task is already in the cgroup */
2048 if (ent.cgrp == cgrp) 2007 if (ent.cgrp == cgrp)
2049 continue; 2008 goto next;
2050 /* 2009 /*
2051 * saying GFP_ATOMIC has no effect here because we did prealloc 2010 * saying GFP_ATOMIC has no effect here because we did prealloc
2052 * earlier, but it's good form to communicate our expectations. 2011 * earlier, but it's good form to communicate our expectations.
@@ -2054,7 +2013,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2054 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2013 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2055 BUG_ON(retval != 0); 2014 BUG_ON(retval != 0);
2056 i++; 2015 i++;
2057 2016 next:
2058 if (!threadgroup) 2017 if (!threadgroup)
2059 break; 2018 break;
2060 } while_each_thread(leader, tsk); 2019 } while_each_thread(leader, tsk);
@@ -3187,11 +3146,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3187 3146
3188 WARN_ON_ONCE(!rcu_read_lock_held()); 3147 WARN_ON_ONCE(!rcu_read_lock_held());
3189 3148
3190 /* if first iteration, visit the leftmost descendant */ 3149 /* if first iteration, visit leftmost descendant which may be @root */
3191 if (!pos) { 3150 if (!pos)
3192 next = css_leftmost_descendant(root); 3151 return css_leftmost_descendant(root);
3193 return next != root ? next : NULL;
3194 }
3195 3152
3196 /* if we visited @root, we're done */ 3153 /* if we visited @root, we're done */
3197 if (pos == root) 3154 if (pos == root)
@@ -4034,8 +3991,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
4034 struct cgroup_event *event; 3991 struct cgroup_event *event;
4035 struct cgroup_subsys_state *cfile_css; 3992 struct cgroup_subsys_state *cfile_css;
4036 unsigned int efd, cfd; 3993 unsigned int efd, cfd;
4037 struct file *efile; 3994 struct fd efile;
4038 struct file *cfile; 3995 struct fd cfile;
4039 char *endp; 3996 char *endp;
4040 int ret; 3997 int ret;
4041 3998
@@ -4058,31 +4015,31 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
4058 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 4015 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
4059 INIT_WORK(&event->remove, cgroup_event_remove); 4016 INIT_WORK(&event->remove, cgroup_event_remove);
4060 4017
4061 efile = eventfd_fget(efd); 4018 efile = fdget(efd);
4062 if (IS_ERR(efile)) { 4019 if (!efile.file) {
4063 ret = PTR_ERR(efile); 4020 ret = -EBADF;
4064 goto out_kfree; 4021 goto out_kfree;
4065 } 4022 }
4066 4023
4067 event->eventfd = eventfd_ctx_fileget(efile); 4024 event->eventfd = eventfd_ctx_fileget(efile.file);
4068 if (IS_ERR(event->eventfd)) { 4025 if (IS_ERR(event->eventfd)) {
4069 ret = PTR_ERR(event->eventfd); 4026 ret = PTR_ERR(event->eventfd);
4070 goto out_put_efile; 4027 goto out_put_efile;
4071 } 4028 }
4072 4029
4073 cfile = fget(cfd); 4030 cfile = fdget(cfd);
4074 if (!cfile) { 4031 if (!cfile.file) {
4075 ret = -EBADF; 4032 ret = -EBADF;
4076 goto out_put_eventfd; 4033 goto out_put_eventfd;
4077 } 4034 }
4078 4035
4079 /* the process need read permission on control file */ 4036 /* the process need read permission on control file */
4080 /* AV: shouldn't we check that it's been opened for read instead? */ 4037 /* AV: shouldn't we check that it's been opened for read instead? */
4081 ret = inode_permission(file_inode(cfile), MAY_READ); 4038 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4082 if (ret < 0) 4039 if (ret < 0)
4083 goto out_put_cfile; 4040 goto out_put_cfile;
4084 4041
4085 event->cft = __file_cft(cfile); 4042 event->cft = __file_cft(cfile.file);
4086 if (IS_ERR(event->cft)) { 4043 if (IS_ERR(event->cft)) {
4087 ret = PTR_ERR(event->cft); 4044 ret = PTR_ERR(event->cft);
4088 goto out_put_cfile; 4045 goto out_put_cfile;
@@ -4103,7 +4060,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
4103 4060
4104 ret = -EINVAL; 4061 ret = -EINVAL;
4105 event->css = cgroup_css(cgrp, event->cft->ss); 4062 event->css = cgroup_css(cgrp, event->cft->ss);
4106 cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss); 4063 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4107 if (event->css && event->css == cfile_css && css_tryget(event->css)) 4064 if (event->css && event->css == cfile_css && css_tryget(event->css))
4108 ret = 0; 4065 ret = 0;
4109 4066
@@ -4121,25 +4078,25 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
4121 if (ret) 4078 if (ret)
4122 goto out_put_css; 4079 goto out_put_css;
4123 4080
4124 efile->f_op->poll(efile, &event->pt); 4081 efile.file->f_op->poll(efile.file, &event->pt);
4125 4082
4126 spin_lock(&cgrp->event_list_lock); 4083 spin_lock(&cgrp->event_list_lock);
4127 list_add(&event->list, &cgrp->event_list); 4084 list_add(&event->list, &cgrp->event_list);
4128 spin_unlock(&cgrp->event_list_lock); 4085 spin_unlock(&cgrp->event_list_lock);
4129 4086
4130 fput(cfile); 4087 fdput(cfile);
4131 fput(efile); 4088 fdput(efile);
4132 4089
4133 return 0; 4090 return 0;
4134 4091
4135out_put_css: 4092out_put_css:
4136 css_put(event->css); 4093 css_put(event->css);
4137out_put_cfile: 4094out_put_cfile:
4138 fput(cfile); 4095 fdput(cfile);
4139out_put_eventfd: 4096out_put_eventfd:
4140 eventfd_ctx_put(event->eventfd); 4097 eventfd_ctx_put(event->eventfd);
4141out_put_efile: 4098out_put_efile:
4142 fput(efile); 4099 fdput(efile);
4143out_kfree: 4100out_kfree:
4144 kfree(event); 4101 kfree(event);
4145 4102
@@ -4241,21 +4198,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4241 goto err; 4198 goto err;
4242 } 4199 }
4243 } 4200 }
4244
4245 /* This cgroup is ready now */
4246 for_each_root_subsys(cgrp->root, ss) {
4247 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4248 struct css_id *id = rcu_dereference_protected(css->id, true);
4249
4250 /*
4251 * Update id->css pointer and make this css visible from
4252 * CSS ID functions. This pointer will be dereferened
4253 * from RCU-read-side without locks.
4254 */
4255 if (id)
4256 rcu_assign_pointer(id->css, css);
4257 }
4258
4259 return 0; 4201 return 0;
4260err: 4202err:
4261 cgroup_clear_dir(cgrp, subsys_mask); 4203 cgroup_clear_dir(cgrp, subsys_mask);
@@ -4324,7 +4266,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4324 css->cgroup = cgrp; 4266 css->cgroup = cgrp;
4325 css->ss = ss; 4267 css->ss = ss;
4326 css->flags = 0; 4268 css->flags = 0;
4327 css->id = NULL;
4328 4269
4329 if (cgrp->parent) 4270 if (cgrp->parent)
4330 css->parent = cgroup_css(cgrp->parent, ss); 4271 css->parent = cgroup_css(cgrp->parent, ss);
@@ -4456,12 +4397,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4456 goto err_free_all; 4397 goto err_free_all;
4457 4398
4458 init_css(css, ss, cgrp); 4399 init_css(css, ss, cgrp);
4459
4460 if (ss->use_id) {
4461 err = alloc_css_id(css);
4462 if (err)
4463 goto err_free_all;
4464 }
4465 } 4400 }
4466 4401
4467 /* 4402 /*
@@ -4926,12 +4861,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4926 4861
4927 /* our new subsystem will be attached to the dummy hierarchy. */ 4862 /* our new subsystem will be attached to the dummy hierarchy. */
4928 init_css(css, ss, cgroup_dummy_top); 4863 init_css(css, ss, cgroup_dummy_top);
4929 /* init_idr must be after init_css() because it sets css->id. */
4930 if (ss->use_id) {
4931 ret = cgroup_init_idr(ss, css);
4932 if (ret)
4933 goto err_unload;
4934 }
4935 4864
4936 /* 4865 /*
4937 * Now we need to entangle the css into the existing css_sets. unlike 4866 * Now we need to entangle the css into the existing css_sets. unlike
@@ -4997,9 +4926,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4997 4926
4998 offline_css(cgroup_css(cgroup_dummy_top, ss)); 4927 offline_css(cgroup_css(cgroup_dummy_top, ss));
4999 4928
5000 if (ss->use_id)
5001 idr_destroy(&ss->idr);
5002
5003 /* deassign the subsys_id */ 4929 /* deassign the subsys_id */
5004 cgroup_subsys[ss->subsys_id] = NULL; 4930 cgroup_subsys[ss->subsys_id] = NULL;
5005 4931
@@ -5026,8 +4952,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
5026 /* 4952 /*
5027 * remove subsystem's css from the cgroup_dummy_top and free it - 4953 * remove subsystem's css from the cgroup_dummy_top and free it -
5028 * need to free before marking as null because ss->css_free needs 4954 * need to free before marking as null because ss->css_free needs
5029 * the cgrp->subsys pointer to find their state. note that this 4955 * the cgrp->subsys pointer to find their state.
5030 * also takes care of freeing the css_id.
5031 */ 4956 */
5032 ss->css_free(cgroup_css(cgroup_dummy_top, ss)); 4957 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
5033 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); 4958 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
@@ -5098,8 +5023,6 @@ int __init cgroup_init(void)
5098 for_each_builtin_subsys(ss, i) { 5023 for_each_builtin_subsys(ss, i) {
5099 if (!ss->early_init) 5024 if (!ss->early_init)
5100 cgroup_init_subsys(ss); 5025 cgroup_init_subsys(ss);
5101 if (ss->use_id)
5102 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
5103 } 5026 }
5104 5027
5105 /* allocate id for the dummy hierarchy */ 5028 /* allocate id for the dummy hierarchy */
@@ -5519,181 +5442,6 @@ static int __init cgroup_disable(char *str)
5519} 5442}
5520__setup("cgroup_disable=", cgroup_disable); 5443__setup("cgroup_disable=", cgroup_disable);
5521 5444
5522/*
5523 * Functons for CSS ID.
5524 */
5525
5526/* to get ID other than 0, this should be called when !cgroup_is_dead() */
5527unsigned short css_id(struct cgroup_subsys_state *css)
5528{
5529 struct css_id *cssid;
5530
5531 /*
5532 * This css_id() can return correct value when somone has refcnt
5533 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5534 * it's unchanged until freed.
5535 */
5536 cssid = rcu_dereference_raw(css->id);
5537
5538 if (cssid)
5539 return cssid->id;
5540 return 0;
5541}
5542EXPORT_SYMBOL_GPL(css_id);
5543
5544/**
5545 * css_is_ancestor - test "root" css is an ancestor of "child"
5546 * @child: the css to be tested.
5547 * @root: the css supporsed to be an ancestor of the child.
5548 *
5549 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
5550 * this function reads css->id, the caller must hold rcu_read_lock().
5551 * But, considering usual usage, the csses should be valid objects after test.
5552 * Assuming that the caller will do some action to the child if this returns
5553 * returns true, the caller must take "child";s reference count.
5554 * If "child" is valid object and this returns true, "root" is valid, too.
5555 */
5556
5557bool css_is_ancestor(struct cgroup_subsys_state *child,
5558 const struct cgroup_subsys_state *root)
5559{
5560 struct css_id *child_id;
5561 struct css_id *root_id;
5562
5563 child_id = rcu_dereference(child->id);
5564 if (!child_id)
5565 return false;
5566 root_id = rcu_dereference(root->id);
5567 if (!root_id)
5568 return false;
5569 if (child_id->depth < root_id->depth)
5570 return false;
5571 if (child_id->stack[root_id->depth] != root_id->id)
5572 return false;
5573 return true;
5574}
5575
5576void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5577{
5578 struct css_id *id = rcu_dereference_protected(css->id, true);
5579
5580 /* When this is called before css_id initialization, id can be NULL */
5581 if (!id)
5582 return;
5583
5584 BUG_ON(!ss->use_id);
5585
5586 rcu_assign_pointer(id->css, NULL);
5587 rcu_assign_pointer(css->id, NULL);
5588 spin_lock(&ss->id_lock);
5589 idr_remove(&ss->idr, id->id);
5590 spin_unlock(&ss->id_lock);
5591 kfree_rcu(id, rcu_head);
5592}
5593EXPORT_SYMBOL_GPL(free_css_id);
5594
5595/*
5596 * This is called by init or create(). Then, calls to this function are
5597 * always serialized (By cgroup_mutex() at create()).
5598 */
5599
5600static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5601{
5602 struct css_id *newid;
5603 int ret, size;
5604
5605 BUG_ON(!ss->use_id);
5606
5607 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
5608 newid = kzalloc(size, GFP_KERNEL);
5609 if (!newid)
5610 return ERR_PTR(-ENOMEM);
5611
5612 idr_preload(GFP_KERNEL);
5613 spin_lock(&ss->id_lock);
5614 /* Don't use 0. allocates an ID of 1-65535 */
5615 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
5616 spin_unlock(&ss->id_lock);
5617 idr_preload_end();
5618
5619 /* Returns error when there are no free spaces for new ID.*/
5620 if (ret < 0)
5621 goto err_out;
5622
5623 newid->id = ret;
5624 newid->depth = depth;
5625 return newid;
5626err_out:
5627 kfree(newid);
5628 return ERR_PTR(ret);
5629
5630}
5631
5632static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5633 struct cgroup_subsys_state *rootcss)
5634{
5635 struct css_id *newid;
5636
5637 spin_lock_init(&ss->id_lock);
5638 idr_init(&ss->idr);
5639
5640 newid = get_new_cssid(ss, 0);
5641 if (IS_ERR(newid))
5642 return PTR_ERR(newid);
5643
5644 newid->stack[0] = newid->id;
5645 RCU_INIT_POINTER(newid->css, rootcss);
5646 RCU_INIT_POINTER(rootcss->id, newid);
5647 return 0;
5648}
5649
5650static int alloc_css_id(struct cgroup_subsys_state *child_css)
5651{
5652 struct cgroup_subsys_state *parent_css = css_parent(child_css);
5653 struct css_id *child_id, *parent_id;
5654 int i, depth;
5655
5656 parent_id = rcu_dereference_protected(parent_css->id, true);
5657 depth = parent_id->depth + 1;
5658
5659 child_id = get_new_cssid(child_css->ss, depth);
5660 if (IS_ERR(child_id))
5661 return PTR_ERR(child_id);
5662
5663 for (i = 0; i < depth; i++)
5664 child_id->stack[i] = parent_id->stack[i];
5665 child_id->stack[depth] = child_id->id;
5666 /*
5667 * child_id->css pointer will be set after this cgroup is available
5668 * see cgroup_populate_dir()
5669 */
5670 rcu_assign_pointer(child_css->id, child_id);
5671
5672 return 0;
5673}
5674
5675/**
5676 * css_lookup - lookup css by id
5677 * @ss: cgroup subsys to be looked into.
5678 * @id: the id
5679 *
5680 * Returns pointer to cgroup_subsys_state if there is valid one with id.
5681 * NULL if not. Should be called under rcu_read_lock()
5682 */
5683struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5684{
5685 struct css_id *cssid = NULL;
5686
5687 BUG_ON(!ss->use_id);
5688 cssid = idr_find(&ss->idr, id);
5689
5690 if (unlikely(!cssid))
5691 return NULL;
5692
5693 return rcu_dereference(cssid->css);
5694}
5695EXPORT_SYMBOL_GPL(css_lookup);
5696
5697/** 5445/**
5698 * css_from_dir - get corresponding css from the dentry of a cgroup dir 5446 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5699 * @dentry: directory dentry of interest 5447 * @dentry: directory dentry of interest
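The stack-of-ancestor-IDs layout above turns "is this css a descendant of that one" into a single array compare (child_id->stack[root_id->depth] == root_id->id) instead of a parent-pointer walk. A minimal standalone sketch of the same idea, using hypothetical types rather than the kernel's struct css_id:

#include <stdio.h>

#define MAX_DEPTH 16

/* Hypothetical stand-in for struct css_id: an ID plus the IDs of all ancestors. */
struct node_id {
	unsigned short id;
	unsigned short depth;
	unsigned short stack[MAX_DEPTH];	/* stack[d] = ID of the ancestor at depth d */
};

/* Mirrors alloc_css_id(): copy the parent's stack, then record our own ID. */
static void init_child(struct node_id *child, const struct node_id *parent,
		       unsigned short id)
{
	int i;

	child->id = id;
	child->depth = parent->depth + 1;
	for (i = 0; i < child->depth; i++)	/* copies the parent's own ID too */
		child->stack[i] = parent->stack[i];
	child->stack[child->depth] = child->id;
}

/* Mirrors the test at the top of the hunk: one array lookup, no parent walk. */
static int is_descendant(const struct node_id *child, const struct node_id *root)
{
	if (child->depth < root->depth)
		return 0;
	return child->stack[root->depth] == root->id;
}

int main(void)
{
	struct node_id root = { .id = 1, .depth = 0, .stack = { 1 } };
	struct node_id a, b;

	init_child(&a, &root, 2);	/* root -> a */
	init_child(&b, &a, 3);		/* root -> a -> b */

	printf("b under a:    %d\n", is_descendant(&b, &a));	/* 1 */
	printf("a under b:    %d\n", is_descendant(&a, &b));	/* 0 */
	printf("b under root: %d\n", is_descendant(&b, &root));	/* 1 */
	return 0;
}

Note the stack is sized per node at allocation time (sizeof(*newid) + (depth + 1) * sizeof(unsigned short) in get_new_cssid()), which is why the depth is fixed once the ID has been created.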
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 247091bf0587..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -51,6 +51,15 @@ void context_tracking_user_enter(void)
51 unsigned long flags; 51 unsigned long flags;
52 52
53 /* 53 /*
54 * Repeat the user_enter() check here because some archs may be calling
55 * this from asm and if no CPU needs context tracking, they shouldn't
56 * go further. Repeat the check here until they support the static key
57 * check.
58 */
59 if (!static_key_false(&context_tracking_enabled))
60 return;
61
62 /*
 54	 * Some contexts may involve an exception occurring in an irq,	63	 * Some contexts may involve an exception occurring in an irq,
55 * leading to that nesting: 64 * leading to that nesting:
56 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() 65 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
@@ -111,7 +120,7 @@ void context_tracking_user_enter(void)
111 * instead of preempt_schedule() to exit user context if needed before 120 * instead of preempt_schedule() to exit user context if needed before
112 * calling the scheduler. 121 * calling the scheduler.
113 */ 122 */
114void __sched notrace preempt_schedule_context(void) 123asmlinkage void __sched notrace preempt_schedule_context(void)
115{ 124{
116 enum ctx_state prev_ctx; 125 enum ctx_state prev_ctx;
117 126
@@ -151,6 +160,9 @@ void context_tracking_user_exit(void)
151{ 160{
152 unsigned long flags; 161 unsigned long flags;
153 162
163 if (!static_key_false(&context_tracking_enabled))
164 return;
165
154 if (in_interrupt()) 166 if (in_interrupt())
155 return; 167 return;
156 168
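The two early returns added here are the standard static-key fast path: while no CPU has context tracking enabled, the check compiles down to a patched, not-taken branch and the function falls straight out. A hedged sketch of that pattern using the same 3.12-era jump-label API (the feature name and helpers below are made up):

#include <linux/static_key.h>

/* Hypothetical feature switch, default off: the fast path below is a no-op. */
static struct static_key my_tracking_enabled = STATIC_KEY_INIT_FALSE;

static void do_expensive_tracking(void)
{
	/* ... the slow path proper ... */
}

void my_tracking_hook(void)
{
	/* Same shape as the checks added above: bail out before any real work. */
	if (!static_key_false(&my_tracking_enabled))
		return;

	do_expensive_tracking();
}

/* Flip the key when the first user appears / the last user goes away. */
void my_tracking_inc(void) { static_key_slow_inc(&my_tracking_enabled); }
void my_tracking_dec(void) { static_key_slow_dec(&my_tracking_enabled); }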
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d7f07a2da5a6..deff2e693766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -306,8 +306,28 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
306 __func__, cpu); 306 __func__, cpu);
307 goto out_release; 307 goto out_release;
308 } 308 }
309
310 /*
311 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
312 * and RCU users of this state to go away such that all new such users
313 * will observe it.
314 *
315 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
316 * not imply sync_sched(), so explicitly call both.
317 *
 318	 * Do the sync before parking the smpboot threads to take care of the RCU boost case.
319 */
320#ifdef CONFIG_PREEMPT
321 synchronize_sched();
322#endif
323 synchronize_rcu();
324
309 smpboot_park_threads(cpu); 325 smpboot_park_threads(cpu);
310 326
327 /*
328 * So now all preempt/rcu users must observe !cpu_active().
329 */
330
311 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 331 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
312 if (err) { 332 if (err) {
313 /* CPU didn't die: tell everyone. Can't complain. */ 333 /* CPU didn't die: tell everyone. Can't complain. */
@@ -420,11 +440,6 @@ int cpu_up(unsigned int cpu)
420{ 440{
421 int err = 0; 441 int err = 0;
422 442
423#ifdef CONFIG_MEMORY_HOTPLUG
424 int nid;
425 pg_data_t *pgdat;
426#endif
427
428 if (!cpu_possible(cpu)) { 443 if (!cpu_possible(cpu)) {
429 printk(KERN_ERR "can't online cpu %d because it is not " 444 printk(KERN_ERR "can't online cpu %d because it is not "
430 "configured as may-hotadd at boot time\n", cpu); 445 "configured as may-hotadd at boot time\n", cpu);
@@ -435,27 +450,9 @@ int cpu_up(unsigned int cpu)
435 return -EINVAL; 450 return -EINVAL;
436 } 451 }
437 452
438#ifdef CONFIG_MEMORY_HOTPLUG 453 err = try_online_node(cpu_to_node(cpu));
439 nid = cpu_to_node(cpu); 454 if (err)
440 if (!node_online(nid)) { 455 return err;
441 err = mem_online_node(nid);
442 if (err)
443 return err;
444 }
445
446 pgdat = NODE_DATA(nid);
447 if (!pgdat) {
448 printk(KERN_ERR
449 "Can't online cpu %d due to NULL pgdat\n", cpu);
450 return -ENOMEM;
451 }
452
453 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
454 mutex_lock(&zonelists_mutex);
455 build_all_zonelists(NULL, NULL);
456 mutex_unlock(&zonelists_mutex);
457 }
458#endif
459 456
460 cpu_maps_update_begin(); 457 cpu_maps_update_begin();
461 458
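The synchronize_sched()/synchronize_rcu() pair above is there to flush two different kinds of cpu_active_mask readers before the CPU is torn down: preempt-disabled sections and rcu_read_lock() sections, which are distinct grace periods under CONFIG_PREEMPT. A rough sketch of the reader/writer shape being relied on, with hypothetical helper names:

#include <linux/cpumask.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>

/*
 * Hypothetical reader: samples cpu_active() under preempt_disable() and then
 * acts on the answer (e.g. hands the CPU work that must drain before it dies).
 */
static bool try_use_cpu(int cpu)
{
	bool ok;

	preempt_disable();
	ok = cpu_active(cpu);
	if (ok) {
		/* ... queue something on that CPU ... */
	}
	preempt_enable();
	return ok;
}

/*
 * Writer side, same shape as the hunk above: after clearing the CPU from
 * cpu_active_mask, wait for both reader flavours, because preemptible RCU's
 * synchronize_rcu() does not imply synchronize_sched().
 */
static void flush_cpu_active_readers(void)
{
#ifdef CONFIG_PREEMPT
	synchronize_sched();	/* preempt_disable() readers */
#endif
	synchronize_rcu();	/* rcu_read_lock() readers */
}

Once both grace periods have elapsed, every new reader is guaranteed to observe !cpu_active(), which is what the "So now all preempt/rcu users must observe !cpu_active()" comment before __stop_machine() is asserting.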
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
44 rcu_idle_enter(); 44 rcu_idle_enter();
45 trace_cpu_idle_rcuidle(0, smp_processor_id()); 45 trace_cpu_idle_rcuidle(0, smp_processor_id());
46 local_irq_enable(); 46 local_irq_enable();
47 while (!need_resched()) 47 while (!tif_need_resched())
48 cpu_relax(); 48 cpu_relax();
49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
50 rcu_idle_exit(); 50 rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) { 92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
93 cpu_idle_poll(); 93 cpu_idle_poll();
94 } else { 94 } else {
95 current_clr_polling(); 95 if (!current_clr_polling_and_test()) {
96 if (!need_resched()) {
97 stop_critical_timings(); 96 stop_critical_timings();
98 rcu_idle_enter(); 97 rcu_idle_enter();
99 arch_cpu_idle(); 98 arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
103 } else { 102 } else {
104 local_irq_enable(); 103 local_irq_enable();
105 } 104 }
106 current_set_polling(); 105 __current_set_polling();
107 } 106 }
108 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
 110	 * bit here because we might not have sent the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
109 } 115 }
110 tick_nohz_idle_exit(); 116 tick_nohz_idle_exit();
111 schedule_preempt_disabled(); 117 schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
129 */ 135 */
130 boot_init_stack_canary(); 136 boot_init_stack_canary();
131#endif 137#endif
132 current_set_polling(); 138 __current_set_polling();
133 arch_cpu_idle_prepare(); 139 arch_cpu_idle_prepare();
134 cpu_idle_loop(); 140 cpu_idle_loop();
135} 141}
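The __current_set_polling()/tif_need_resched() dance is the usual polling-idle contract: the idle CPU advertises that it is polling so a remote waker can set TIF_NEED_RESCHED and skip the reschedule IPI, which in turn means the idle loop itself must notice the flag and propagate it. A userspace analogue of that contract (pthreads plus C11 atomics, made-up names; the real code calls set_preempt_need_resched() rather than printing):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool need_resched = false;	/* analogue of TIF_NEED_RESCHED */
static atomic_bool polling      = false;	/* analogue of TIF_POLLING_NRFLAG */

static void *idle_loop(void *arg)
{
	atomic_store(&polling, true);		/* advertise: no IPI needed */

	while (!atomic_load(&need_resched))
		;				/* cpu_relax() stand-in */

	atomic_store(&polling, false);

	/*
	 * Same idea as the hunk above: the waker skipped the "IPI", so the
	 * idle side has to notice the flag and act on it itself.
	 */
	if (atomic_load(&need_resched))
		puts("idle: folding need_resched without an IPI");
	return NULL;
}

static void resched_cpu(void)
{
	atomic_store(&need_resched, true);	/* seq_cst: ordered before the read below */
	if (!atomic_load(&polling))
		puts("waker: would send a reschedule IPI");
	else
		puts("waker: target is polling, IPI skipped");
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, idle_loop, NULL);
	resched_cpu();
	pthread_join(&t, NULL);
	return 0;
}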
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d447aed2..7d2f35e5df2f 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -575,8 +575,12 @@ return_normal:
575 raw_spin_lock(&dbg_slave_lock); 575 raw_spin_lock(&dbg_slave_lock);
576 576
577#ifdef CONFIG_SMP 577#ifdef CONFIG_SMP
578 /* If send_ready set, slaves are already waiting */
579 if (ks->send_ready)
580 atomic_set(ks->send_ready, 1);
581
578 /* Signal the other CPUs to enter kgdb_wait() */ 582 /* Signal the other CPUs to enter kgdb_wait() */
579 if ((!kgdb_single_step) && kgdb_do_roundup) 583 else if ((!kgdb_single_step) && kgdb_do_roundup)
580 kgdb_roundup_cpus(flags); 584 kgdb_roundup_cpus(flags);
581#endif 585#endif
582 586
@@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
678 if (arch_kgdb_ops.enable_nmi) 682 if (arch_kgdb_ops.enable_nmi)
679 arch_kgdb_ops.enable_nmi(0); 683 arch_kgdb_ops.enable_nmi(0);
680 684
685 memset(ks, 0, sizeof(struct kgdb_state));
681 ks->cpu = raw_smp_processor_id(); 686 ks->cpu = raw_smp_processor_id();
682 ks->ex_vector = evector; 687 ks->ex_vector = evector;
683 ks->signo = signo; 688 ks->signo = signo;
684 ks->err_code = ecode; 689 ks->err_code = ecode;
685 ks->kgdb_usethreadid = 0;
686 ks->linux_regs = regs; 690 ks->linux_regs = regs;
687 691
688 if (kgdb_reenter_check(ks)) 692 if (kgdb_reenter_check(ks))
@@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs)
732 return 1; 736 return 1;
733} 737}
734 738
739int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready)
740{
741#ifdef CONFIG_SMP
742 if (!kgdb_io_ready(0) || !send_ready)
743 return 1;
744
745 if (kgdb_info[cpu].enter_kgdb == 0) {
746 struct kgdb_state kgdb_var;
747 struct kgdb_state *ks = &kgdb_var;
748
749 memset(ks, 0, sizeof(struct kgdb_state));
750 ks->cpu = cpu;
751 ks->ex_vector = trapnr;
752 ks->signo = SIGTRAP;
753 ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI;
754 ks->linux_regs = regs;
755 ks->send_ready = send_ready;
756 kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
757 return 0;
758 }
759#endif
760 return 1;
761}
762
735static void kgdb_console_write(struct console *co, const char *s, 763static void kgdb_console_write(struct console *co, const char *s,
736 unsigned count) 764 unsigned count)
737{ 765{
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 2235967e78b0..572aa4f5677c 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -26,6 +26,7 @@ struct kgdb_state {
26 unsigned long threadid; 26 unsigned long threadid;
27 long kgdb_usethreadid; 27 long kgdb_usethreadid;
28 struct pt_regs *linux_regs; 28 struct pt_regs *linux_regs;
29 atomic_t *send_ready;
29}; 30};
30 31
31/* Exception state values */ 32/* Exception state values */
@@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks);
74extern int kdb_parse(const char *cmdstr); 75extern int kdb_parse(const char *cmdstr);
75extern int kdb_common_init_state(struct kgdb_state *ks); 76extern int kdb_common_init_state(struct kgdb_state *ks);
76extern int kdb_common_deinit_state(void); 77extern int kdb_common_deinit_state(void);
78#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI
77#else /* ! CONFIG_KGDB_KDB */ 79#else /* ! CONFIG_KGDB_KDB */
78static inline int kdb_stub(struct kgdb_state *ks) 80static inline int kdb_stub(struct kgdb_state *ks)
79{ 81{
80 return DBG_PASS_EVENT; 82 return DBG_PASS_EVENT;
81} 83}
84#define KGDB_KDB_REASON_SYSTEM_NMI 0
82#endif /* CONFIG_KGDB_KDB */ 85#endif /* CONFIG_KGDB_KDB */
83 86
84#endif /* _DEBUG_CORE_H_ */ 87#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 328d18ef31e4..8859ca34dcfe 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks)
69 if (atomic_read(&kgdb_setting_breakpoint)) 69 if (atomic_read(&kgdb_setting_breakpoint))
70 reason = KDB_REASON_KEYBOARD; 70 reason = KDB_REASON_KEYBOARD;
71 71
72 if (in_nmi()) 72 if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
73 reason = KDB_REASON_SYSTEM_NMI;
74
75 else if (in_nmi())
73 reason = KDB_REASON_NMI; 76 reason = KDB_REASON_NMI;
74 77
75 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { 78 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 00eb8f7fbf41..0b097c8a1e50 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1200 instruction_pointer(regs)); 1200 instruction_pointer(regs));
1201 kdb_dumpregs(regs); 1201 kdb_dumpregs(regs);
1202 break; 1202 break;
1203 case KDB_REASON_SYSTEM_NMI:
1204 kdb_printf("due to System NonMaskable Interrupt\n");
1205 break;
1203 case KDB_REASON_NMI: 1206 case KDB_REASON_NMI:
1204 kdb_printf("due to NonMaskable Interrupt @ " 1207 kdb_printf("due to NonMaskable Interrupt @ "
1205 kdb_machreg_fmt "\n", 1208 kdb_machreg_fmt "\n",
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index d473988c1d0b..54996b71e66d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -108,12 +108,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
108 struct timespec ts; 108 struct timespec ts;
109 cputime_t utime, stime, stimescaled, utimescaled; 109 cputime_t utime, stime, stimescaled, utimescaled;
110 110
111 /* Though tsk->delays accessed later, early exit avoids
112 * unnecessary returning of other data
113 */
114 if (!tsk->delays)
115 goto done;
116
117 tmp = (s64)d->cpu_run_real_total; 111 tmp = (s64)d->cpu_run_real_total;
118 task_cputime(tsk, &utime, &stime); 112 task_cputime(tsk, &utime, &stime);
119 cputime_to_timespec(utime + stime, &ts); 113 cputime_to_timespec(utime + stime, &ts);
@@ -158,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
158 d->freepages_count += tsk->delays->freepages_count; 152 d->freepages_count += tsk->delays->freepages_count;
159 spin_unlock_irqrestore(&tsk->delays->lock, flags); 153 spin_unlock_irqrestore(&tsk->delays->lock, flags);
160 154
161done:
162 return 0; 155 return 0;
163} 156}
164 157
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index ff915efef66d..e556751d15d9 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,23 +1,19 @@
1#include <linux/elf.h> 1#include <linux/elf.h>
2#include <linux/fs.h> 2#include <linux/fs.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4 4#include <linux/binfmts.h>
5#include <asm/elf.h>
6
7 5
8Elf_Half __weak elf_core_extra_phdrs(void) 6Elf_Half __weak elf_core_extra_phdrs(void)
9{ 7{
10 return 0; 8 return 0;
11} 9}
12 10
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, 11int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
14 unsigned long limit)
15{ 12{
16 return 1; 13 return 1;
17} 14}
18 15
19int __weak elf_core_write_extra_data(struct file *file, size_t *size, 16int __weak elf_core_write_extra_data(struct coredump_params *cprm)
20 unsigned long limit)
21{ 17{
22 return 1; 18 return 1;
23} 19}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2207efc941d1..d724e7757cd1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
175static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 175static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
176static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; 176static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
177 177
178static atomic_t perf_sample_allowed_ns __read_mostly = 178static int perf_sample_allowed_ns __read_mostly =
179 ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); 179 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
180 180
181void update_perf_cpu_limits(void) 181void update_perf_cpu_limits(void)
182{ 182{
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
184 184
185 tmp *= sysctl_perf_cpu_time_max_percent; 185 tmp *= sysctl_perf_cpu_time_max_percent;
186 do_div(tmp, 100); 186 do_div(tmp, 100);
187 atomic_set(&perf_sample_allowed_ns, tmp); 187 ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
188} 188}
189 189
190static int perf_rotate_context(struct perf_cpu_context *cpuctx); 190static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
193 void __user *buffer, size_t *lenp, 193 void __user *buffer, size_t *lenp,
194 loff_t *ppos) 194 loff_t *ppos)
195{ 195{
196 int ret = proc_dointvec(table, write, buffer, lenp, ppos); 196 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
197 197
198 if (ret || !write) 198 if (ret || !write)
199 return ret; 199 return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
228 * we detect that events are taking too long. 228 * we detect that events are taking too long.
229 */ 229 */
230#define NR_ACCUMULATED_SAMPLES 128 230#define NR_ACCUMULATED_SAMPLES 128
231DEFINE_PER_CPU(u64, running_sample_length); 231static DEFINE_PER_CPU(u64, running_sample_length);
232 232
233void perf_sample_event_took(u64 sample_len_ns) 233void perf_sample_event_took(u64 sample_len_ns)
234{ 234{
235 u64 avg_local_sample_len; 235 u64 avg_local_sample_len;
236 u64 local_samples_len; 236 u64 local_samples_len;
237 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
237 238
238 if (atomic_read(&perf_sample_allowed_ns) == 0) 239 if (allowed_ns == 0)
239 return; 240 return;
240 241
241 /* decay the counter by 1 average sample */ 242 /* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
251 */ 252 */
252 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 253 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
253 254
254 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) 255 if (avg_local_sample_len <= allowed_ns)
255 return; 256 return;
256 257
257 if (max_samples_per_tick <= 1) 258 if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
262 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 263 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
263 264
264 printk_ratelimited(KERN_WARNING 265 printk_ratelimited(KERN_WARNING
265 "perf samples too long (%lld > %d), lowering " 266 "perf samples too long (%lld > %lld), lowering "
266 "kernel.perf_event_max_sample_rate to %d\n", 267 "kernel.perf_event_max_sample_rate to %d\n",
267 avg_local_sample_len, 268 avg_local_sample_len, allowed_ns,
268 atomic_read(&perf_sample_allowed_ns),
269 sysctl_perf_event_sample_rate); 269 sysctl_perf_event_sample_rate);
270 270
271 update_perf_cpu_limits(); 271 update_perf_cpu_limits();
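perf_sample_allowed_ns only ever moves from one sane value to another, so a plain int read with ACCESS_ONCE() is enough; the interesting part is the decaying accumulator it is compared against. A plain-C rendition of that check with made-up numbers (the per-CPU variable and the ACCESS_ONCE() snapshot collapse to ordinary locals here):

#include <stdint.h>
#include <stdio.h>

#define NR_ACCUMULATED_SAMPLES 128

static uint64_t allowed_ns = 20000;		/* published limit, in ns */
static uint64_t running_sample_length;		/* per-CPU accumulator in the kernel */

static int sample_event_took(uint64_t sample_len_ns)
{
	uint64_t allowed = allowed_ns;		/* single snapshot, like ACCESS_ONCE() */
	uint64_t avg;

	if (!allowed)
		return 0;

	/* decay the accumulator by one average sample, then add the new one */
	running_sample_length -= running_sample_length / NR_ACCUMULATED_SAMPLES;
	running_sample_length += sample_len_ns;

	avg = running_sample_length / NR_ACCUMULATED_SAMPLES;
	return avg > allowed;			/* 1 => lower the sample rate */
}

int main(void)
{
	int i, throttled = 0;

	for (i = 0; i < 4096 && !throttled; i++)
		throttled = sample_event_took(50000);	/* every sample over budget */

	printf("throttled after %d samples\n", i);
	return 0;
}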
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
899 put_ctx(ctx->parent_ctx); 899 put_ctx(ctx->parent_ctx);
900 ctx->parent_ctx = NULL; 900 ctx->parent_ctx = NULL;
901 } 901 }
902 ctx->generation++;
902} 903}
903 904
904static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 905static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1136 ctx->nr_events++; 1137 ctx->nr_events++;
1137 if (event->attr.inherit_stat) 1138 if (event->attr.inherit_stat)
1138 ctx->nr_stat++; 1139 ctx->nr_stat++;
1140
1141 ctx->generation++;
1139} 1142}
1140 1143
1141/* 1144/*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
1201 if (sample_type & PERF_SAMPLE_DATA_SRC) 1204 if (sample_type & PERF_SAMPLE_DATA_SRC)
1202 size += sizeof(data->data_src.val); 1205 size += sizeof(data->data_src.val);
1203 1206
1207 if (sample_type & PERF_SAMPLE_TRANSACTION)
1208 size += sizeof(data->txn);
1209
1204 event->header_size = size; 1210 event->header_size = size;
1205} 1211}
1206 1212
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1310 */ 1316 */
1311 if (event->state > PERF_EVENT_STATE_OFF) 1317 if (event->state > PERF_EVENT_STATE_OFF)
1312 event->state = PERF_EVENT_STATE_OFF; 1318 event->state = PERF_EVENT_STATE_OFF;
1319
1320 ctx->generation++;
1313} 1321}
1314 1322
1315static void perf_group_detach(struct perf_event *event) 1323static void perf_group_detach(struct perf_event *event)
@@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2146} 2154}
2147 2155
2148/* 2156/*
2149 * Test whether two contexts are equivalent, i.e. whether they 2157 * Test whether two contexts are equivalent, i.e. whether they have both been
2150 * have both been cloned from the same version of the same context 2158 * cloned from the same version of the same context.
2151 * and they both have the same number of enabled events. 2159 *
2152 * If the number of enabled events is the same, then the set 2160 * Equivalence is measured using a generation number in the context that is
2153 * of enabled events should be the same, because these are both 2161 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2154 * inherited contexts, therefore we can't access individual events 2162 * and list_del_event().
2155 * in them directly with an fd; we can only enable/disable all
2156 * events via prctl, or enable/disable all events in a family
2157 * via ioctl, which will have the same effect on both contexts.
2158 */ 2163 */
2159static int context_equiv(struct perf_event_context *ctx1, 2164static int context_equiv(struct perf_event_context *ctx1,
2160 struct perf_event_context *ctx2) 2165 struct perf_event_context *ctx2)
2161{ 2166{
2162 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx 2167 /* Pinning disables the swap optimization */
2163 && ctx1->parent_gen == ctx2->parent_gen 2168 if (ctx1->pin_count || ctx2->pin_count)
2164 && !ctx1->pin_count && !ctx2->pin_count; 2169 return 0;
2170
2171 /* If ctx1 is the parent of ctx2 */
2172 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2173 return 1;
2174
2175 /* If ctx2 is the parent of ctx1 */
2176 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2177 return 1;
2178
2179 /*
2180 * If ctx1 and ctx2 have the same parent; we flatten the parent
2181 * hierarchy, see perf_event_init_context().
2182 */
2183 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2184 ctx1->parent_gen == ctx2->parent_gen)
2185 return 1;
2186
2187 /* Unmatched */
2188 return 0;
2165} 2189}
2166 2190
2167static void __perf_event_sync_stat(struct perf_event *event, 2191static void __perf_event_sync_stat(struct perf_event *event,
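context_equiv() now keys off a per-context generation counter instead of counting enabled events: every list_add_event(), list_del_event() and unclone_ctx() bumps ->generation, a clone records the parent's value in ->parent_gen, and "still equivalent" reduces to a few integer compares. A small standalone model of that logic (the field names mirror the kernel's, everything else is made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ctx {
	struct ctx *parent_ctx;
	uint64_t generation;	/* bumped on every add/del/unclone */
	uint64_t parent_gen;	/* parent's generation at clone time */
	int pin_count;
};

static void ctx_modify(struct ctx *c)	/* list_add_event()/list_del_event() analogue */
{
	c->generation++;
}

static bool context_equiv(struct ctx *c1, struct ctx *c2)
{
	if (c1->pin_count || c2->pin_count)
		return false;
	if (c1 == c2->parent_ctx && c1->generation == c2->parent_gen)
		return true;
	if (c1->parent_ctx == c2 && c1->parent_gen == c2->generation)
		return true;
	if (c1->parent_ctx && c1->parent_ctx == c2->parent_ctx &&
	    c1->parent_gen == c2->parent_gen)
		return true;
	return false;
}

int main(void)
{
	struct ctx parent = { 0 };
	struct ctx child  = { .parent_ctx = &parent, .parent_gen = parent.generation };

	printf("fresh clone:         %d\n", context_equiv(&parent, &child));	/* 1 */

	ctx_modify(&parent);	/* a new event appears in the parent only */
	printf("after parent change: %d\n", context_equiv(&parent, &child));	/* 0 */

	parent.pin_count = 1;	/* pinning always defeats the swap optimization */
	printf("while pinned:        %d\n", context_equiv(&parent, &child));	/* 0 */
	return 0;
}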
@@ -2210,9 +2234,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
2210 perf_event_update_userpage(next_event); 2234 perf_event_update_userpage(next_event);
2211} 2235}
2212 2236
2213#define list_next_entry(pos, member) \
2214 list_entry(pos->member.next, typeof(*pos), member)
2215
2216static void perf_event_sync_stat(struct perf_event_context *ctx, 2237static void perf_event_sync_stat(struct perf_event_context *ctx,
2217 struct perf_event_context *next_ctx) 2238 struct perf_event_context *next_ctx)
2218{ 2239{
@@ -2244,7 +2265,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2244{ 2265{
2245 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 2266 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2246 struct perf_event_context *next_ctx; 2267 struct perf_event_context *next_ctx;
2247 struct perf_event_context *parent; 2268 struct perf_event_context *parent, *next_parent;
2248 struct perf_cpu_context *cpuctx; 2269 struct perf_cpu_context *cpuctx;
2249 int do_switch = 1; 2270 int do_switch = 1;
2250 2271
@@ -2256,10 +2277,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2256 return; 2277 return;
2257 2278
2258 rcu_read_lock(); 2279 rcu_read_lock();
2259 parent = rcu_dereference(ctx->parent_ctx);
2260 next_ctx = next->perf_event_ctxp[ctxn]; 2280 next_ctx = next->perf_event_ctxp[ctxn];
2261 if (parent && next_ctx && 2281 if (!next_ctx)
2262 rcu_dereference(next_ctx->parent_ctx) == parent) { 2282 goto unlock;
2283
2284 parent = rcu_dereference(ctx->parent_ctx);
2285 next_parent = rcu_dereference(next_ctx->parent_ctx);
2286
2287 /* If neither context have a parent context; they cannot be clones. */
2288 if (!parent && !next_parent)
2289 goto unlock;
2290
2291 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2263 /* 2292 /*
2264 * Looks like the two contexts are clones, so we might be 2293 * Looks like the two contexts are clones, so we might be
2265 * able to optimize the context switch. We lock both 2294 * able to optimize the context switch. We lock both
@@ -2287,6 +2316,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2287 raw_spin_unlock(&next_ctx->lock); 2316 raw_spin_unlock(&next_ctx->lock);
2288 raw_spin_unlock(&ctx->lock); 2317 raw_spin_unlock(&ctx->lock);
2289 } 2318 }
2319unlock:
2290 rcu_read_unlock(); 2320 rcu_read_unlock();
2291 2321
2292 if (do_switch) { 2322 if (do_switch) {
@@ -3660,6 +3690,26 @@ static void calc_timer_values(struct perf_event *event,
3660 *running = ctx_time - event->tstamp_running; 3690 *running = ctx_time - event->tstamp_running;
3661} 3691}
3662 3692
3693static void perf_event_init_userpage(struct perf_event *event)
3694{
3695 struct perf_event_mmap_page *userpg;
3696 struct ring_buffer *rb;
3697
3698 rcu_read_lock();
3699 rb = rcu_dereference(event->rb);
3700 if (!rb)
3701 goto unlock;
3702
3703 userpg = rb->user_page;
3704
3705 /* Allow new userspace to detect that bit 0 is deprecated */
3706 userpg->cap_bit0_is_deprecated = 1;
3707 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
3708
3709unlock:
3710 rcu_read_unlock();
3711}
3712
3663void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 3713void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3664{ 3714{
3665} 3715}
@@ -4044,6 +4094,7 @@ again:
4044 ring_buffer_attach(event, rb); 4094 ring_buffer_attach(event, rb);
4045 rcu_assign_pointer(event->rb, rb); 4095 rcu_assign_pointer(event->rb, rb);
4046 4096
4097 perf_event_init_userpage(event);
4047 perf_event_update_userpage(event); 4098 perf_event_update_userpage(event);
4048 4099
4049unlock: 4100unlock:
@@ -4551,6 +4602,9 @@ void perf_output_sample(struct perf_output_handle *handle,
4551 if (sample_type & PERF_SAMPLE_DATA_SRC) 4602 if (sample_type & PERF_SAMPLE_DATA_SRC)
4552 perf_output_put(handle, data->data_src.val); 4603 perf_output_put(handle, data->data_src.val);
4553 4604
4605 if (sample_type & PERF_SAMPLE_TRANSACTION)
4606 perf_output_put(handle, data->txn);
4607
4554 if (!event->attr.watermark) { 4608 if (!event->attr.watermark) {
4555 int wakeup_events = event->attr.wakeup_events; 4609 int wakeup_events = event->attr.wakeup_events;
4556 4610
@@ -5039,6 +5093,7 @@ static void perf_event_mmap_output(struct perf_event *event,
5039 mmap_event->event_id.header.size += sizeof(mmap_event->maj); 5093 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5040 mmap_event->event_id.header.size += sizeof(mmap_event->min); 5094 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5041 mmap_event->event_id.header.size += sizeof(mmap_event->ino); 5095 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5096 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5042 } 5097 }
5043 5098
5044 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5099 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
@@ -5078,27 +5133,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5078 unsigned int size; 5133 unsigned int size;
5079 char tmp[16]; 5134 char tmp[16];
5080 char *buf = NULL; 5135 char *buf = NULL;
5081 const char *name; 5136 char *name;
5082
5083 memset(tmp, 0, sizeof(tmp));
5084 5137
5085 if (file) { 5138 if (file) {
5086 struct inode *inode; 5139 struct inode *inode;
5087 dev_t dev; 5140 dev_t dev;
5141
5142 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5143 if (!buf) {
5144 name = "//enomem";
5145 goto cpy_name;
5146 }
5088 /* 5147 /*
5089 * d_path works from the end of the rb backwards, so we 5148 * d_path() works from the end of the rb backwards, so we
5090 * need to add enough zero bytes after the string to handle 5149 * need to add enough zero bytes after the string to handle
5091 * the 64bit alignment we do later. 5150 * the 64bit alignment we do later.
5092 */ 5151 */
5093 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); 5152 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5094 if (!buf) {
5095 name = strncpy(tmp, "//enomem", sizeof(tmp));
5096 goto got_name;
5097 }
5098 name = d_path(&file->f_path, buf, PATH_MAX);
5099 if (IS_ERR(name)) { 5153 if (IS_ERR(name)) {
5100 name = strncpy(tmp, "//toolong", sizeof(tmp)); 5154 name = "//toolong";
5101 goto got_name; 5155 goto cpy_name;
5102 } 5156 }
5103 inode = file_inode(vma->vm_file); 5157 inode = file_inode(vma->vm_file);
5104 dev = inode->i_sb->s_dev; 5158 dev = inode->i_sb->s_dev;
@@ -5106,34 +5160,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5106 gen = inode->i_generation; 5160 gen = inode->i_generation;
5107 maj = MAJOR(dev); 5161 maj = MAJOR(dev);
5108 min = MINOR(dev); 5162 min = MINOR(dev);
5109 5163 goto got_name;
5110 } else { 5164 } else {
5111 if (arch_vma_name(mmap_event->vma)) { 5165 name = (char *)arch_vma_name(vma);
5112 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 5166 if (name)
5113 sizeof(tmp) - 1); 5167 goto cpy_name;
5114 tmp[sizeof(tmp) - 1] = '\0';
5115 goto got_name;
5116 }
5117 5168
5118 if (!vma->vm_mm) { 5169 if (vma->vm_start <= vma->vm_mm->start_brk &&
5119 name = strncpy(tmp, "[vdso]", sizeof(tmp));
5120 goto got_name;
5121 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
5122 vma->vm_end >= vma->vm_mm->brk) { 5170 vma->vm_end >= vma->vm_mm->brk) {
5123 name = strncpy(tmp, "[heap]", sizeof(tmp)); 5171 name = "[heap]";
5124 goto got_name; 5172 goto cpy_name;
5125 } else if (vma->vm_start <= vma->vm_mm->start_stack && 5173 }
5174 if (vma->vm_start <= vma->vm_mm->start_stack &&
5126 vma->vm_end >= vma->vm_mm->start_stack) { 5175 vma->vm_end >= vma->vm_mm->start_stack) {
5127 name = strncpy(tmp, "[stack]", sizeof(tmp)); 5176 name = "[stack]";
5128 goto got_name; 5177 goto cpy_name;
5129 } 5178 }
5130 5179
5131 name = strncpy(tmp, "//anon", sizeof(tmp)); 5180 name = "//anon";
5132 goto got_name; 5181 goto cpy_name;
5133 } 5182 }
5134 5183
5184cpy_name:
5185 strlcpy(tmp, name, sizeof(tmp));
5186 name = tmp;
5135got_name: 5187got_name:
5136 size = ALIGN(strlen(name)+1, sizeof(u64)); 5188 /*
5189 * Since our buffer works in 8 byte units we need to align our string
5190 * size to a multiple of 8. However, we must guarantee the tail end is
5191 * zero'd out to avoid leaking random bits to userspace.
5192 */
5193 size = strlen(name)+1;
5194 while (!IS_ALIGNED(size, sizeof(u64)))
5195 name[size++] = '\0';
5137 5196
5138 mmap_event->file_name = name; 5197 mmap_event->file_name = name;
5139 mmap_event->file_size = size; 5198 mmap_event->file_size = size;
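The got_name: padding loop replaces the old ALIGN(strlen+1, 8) because rounding the size up without writing the extra bytes could copy stale buffer contents out to userspace. A tiny demonstration of the padding rule (userspace C, assumed buffer sizes):

#include <stdio.h>
#include <string.h>

#define ALIGN_TO 8

/*
 * Pad the string in buf (which must have room for the padding) with NUL
 * bytes until its total size is a multiple of 8, and return that size.
 * Same shape as the loop above: the pad bytes are written, not just counted.
 */
static size_t pad_name(char *buf)
{
	size_t size = strlen(buf) + 1;

	while (size % ALIGN_TO)
		buf[size++] = '\0';
	return size;
}

int main(void)
{
	char a[64] = "[heap]";		/* 6 chars + NUL = 7  -> padded to 8  */
	char b[64] = "/usr/bin/perf";	/* 13 chars + NUL = 14 -> padded to 16 */

	printf("%-16s -> %zu bytes\n", a, pad_name(a));
	printf("%-16s -> %zu bytes\n", b, pad_name(b));
	return 0;
}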
@@ -6270,6 +6329,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
6270 6329
6271 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6330 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
6272} 6331}
6332static DEVICE_ATTR_RO(type);
6273 6333
6274static ssize_t 6334static ssize_t
6275perf_event_mux_interval_ms_show(struct device *dev, 6335perf_event_mux_interval_ms_show(struct device *dev,
@@ -6314,17 +6374,19 @@ perf_event_mux_interval_ms_store(struct device *dev,
6314 6374
6315 return count; 6375 return count;
6316} 6376}
6377static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
6317 6378
6318static struct device_attribute pmu_dev_attrs[] = { 6379static struct attribute *pmu_dev_attrs[] = {
6319 __ATTR_RO(type), 6380 &dev_attr_type.attr,
6320 __ATTR_RW(perf_event_mux_interval_ms), 6381 &dev_attr_perf_event_mux_interval_ms.attr,
6321 __ATTR_NULL, 6382 NULL,
6322}; 6383};
6384ATTRIBUTE_GROUPS(pmu_dev);
6323 6385
6324static int pmu_bus_running; 6386static int pmu_bus_running;
6325static struct bus_type pmu_bus = { 6387static struct bus_type pmu_bus = {
6326 .name = "event_source", 6388 .name = "event_source",
6327 .dev_attrs = pmu_dev_attrs, 6389 .dev_groups = pmu_dev_groups,
6328}; 6390};
6329 6391
6330static void pmu_dev_release(struct device *dev) 6392static void pmu_dev_release(struct device *dev)
@@ -6745,6 +6807,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6745 if (ret) 6807 if (ret)
6746 return -EFAULT; 6808 return -EFAULT;
6747 6809
6810 /* disabled for now */
6811 if (attr->mmap2)
6812 return -EINVAL;
6813
6748 if (attr->__reserved_1) 6814 if (attr->__reserved_1)
6749 return -EINVAL; 6815 return -EINVAL;
6750 6816
@@ -7100,7 +7166,6 @@ SYSCALL_DEFINE5(perf_event_open,
7100 } 7166 }
7101 7167
7102 perf_install_in_context(ctx, event, event->cpu); 7168 perf_install_in_context(ctx, event, event->cpu);
7103 ++ctx->generation;
7104 perf_unpin_context(ctx); 7169 perf_unpin_context(ctx);
7105 mutex_unlock(&ctx->mutex); 7170 mutex_unlock(&ctx->mutex);
7106 7171
@@ -7183,7 +7248,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7183 WARN_ON_ONCE(ctx->parent_ctx); 7248 WARN_ON_ONCE(ctx->parent_ctx);
7184 mutex_lock(&ctx->mutex); 7249 mutex_lock(&ctx->mutex);
7185 perf_install_in_context(ctx, event, cpu); 7250 perf_install_in_context(ctx, event, cpu);
7186 ++ctx->generation;
7187 perf_unpin_context(ctx); 7251 perf_unpin_context(ctx);
7188 mutex_unlock(&ctx->mutex); 7252 mutex_unlock(&ctx->mutex);
7189 7253
@@ -7212,15 +7276,15 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7212 perf_remove_from_context(event); 7276 perf_remove_from_context(event);
7213 unaccount_event_cpu(event, src_cpu); 7277 unaccount_event_cpu(event, src_cpu);
7214 put_ctx(src_ctx); 7278 put_ctx(src_ctx);
7215 list_add(&event->event_entry, &events); 7279 list_add(&event->migrate_entry, &events);
7216 } 7280 }
7217 mutex_unlock(&src_ctx->mutex); 7281 mutex_unlock(&src_ctx->mutex);
7218 7282
7219 synchronize_rcu(); 7283 synchronize_rcu();
7220 7284
7221 mutex_lock(&dst_ctx->mutex); 7285 mutex_lock(&dst_ctx->mutex);
7222 list_for_each_entry_safe(event, tmp, &events, event_entry) { 7286 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7223 list_del(&event->event_entry); 7287 list_del(&event->migrate_entry);
7224 if (event->state >= PERF_EVENT_STATE_OFF) 7288 if (event->state >= PERF_EVENT_STATE_OFF)
7225 event->state = PERF_EVENT_STATE_INACTIVE; 7289 event->state = PERF_EVENT_STATE_INACTIVE;
7226 account_event_cpu(event, dst_cpu); 7290 account_event_cpu(event, dst_cpu);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be5..569b218782ad 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
82} 82}
83 83
84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
85static inline unsigned int \ 85static inline unsigned long \
86func_name(struct perf_output_handle *handle, \ 86func_name(struct perf_output_handle *handle, \
87 const void *buf, unsigned int len) \ 87 const void *buf, unsigned long len) \
88{ \ 88{ \
89 unsigned long size, written; \ 89 unsigned long size, written; \
90 \ 90 \
91 do { \ 91 do { \
92 size = min_t(unsigned long, handle->size, len); \ 92 size = min(handle->size, len); \
93 \
94 written = memcpy_func(handle->addr, buf, size); \ 93 written = memcpy_func(handle->addr, buf, size); \
94 written = size - written; \
95 \ 95 \
96 len -= written; \ 96 len -= written; \
97 handle->addr += written; \ 97 handle->addr += written; \
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
110 return len; \ 110 return len; \
111} 111}
112 112
113static inline int memcpy_common(void *dst, const void *src, size_t n) 113static inline unsigned long
114memcpy_common(void *dst, const void *src, unsigned long n)
114{ 115{
115 memcpy(dst, src, n); 116 memcpy(dst, src, n);
116 return n; 117 return 0;
117} 118}
118 119
119DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) 120DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
120 121
121#define MEMCPY_SKIP(dst, src, n) (n) 122static inline unsigned long
123memcpy_skip(void *dst, const void *src, unsigned long n)
124{
125 return 0;
126}
122 127
123DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) 128DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
124 129
125#ifndef arch_perf_out_copy_user 130#ifndef arch_perf_out_copy_user
126#define arch_perf_out_copy_user __copy_from_user_inatomic 131#define arch_perf_out_copy_user arch_perf_out_copy_user
132
133static inline unsigned long
134arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
135{
136 unsigned long ret;
137
138 pagefault_disable();
139 ret = __copy_from_user_inatomic(dst, src, n);
140 pagefault_enable();
141
142 return ret;
143}
127#endif 144#endif
128 145
129DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) 146DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
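The copy callbacks used by DEFINE_OUTPUT_COPY() now follow the copy_from_user() convention: they return the number of bytes *not* copied (0 on full success), and the loop turns that into progress with written = size - written. A standalone sketch of that loop with the ring-buffer handle reduced to one flat buffer (the names and the deliberately faulting copy are made up):

#include <stdio.h>
#include <string.h>

typedef unsigned long (*copy_fn)(void *dst, const void *src, unsigned long n);

static unsigned long memcpy_common(void *dst, const void *src, unsigned long n)
{
	memcpy(dst, src, n);
	return 0;				/* nothing left over */
}

/* Emulate a copy that "faults" after the first 8 bytes of every request. */
static unsigned long memcpy_faulting(void *dst, const void *src, unsigned long n)
{
	unsigned long done = n < 8 ? n : 8;

	memcpy(dst, src, done);
	return n - done;			/* bytes not copied */
}

/* The DEFINE_OUTPUT_COPY() loop, minus the page-advancing bookkeeping. */
static unsigned long output_copy(copy_fn fn, void *dst, unsigned long space,
				 const void *buf, unsigned long len)
{
	do {
		unsigned long size = len < space ? len : space;
		unsigned long written = size - fn(dst, buf, size);

		len   -= written;
		dst    = (char *)dst + written;
		buf    = (const char *)buf + written;
		space -= written;

		if (!space || written < size)	/* buffer exhausted or partial copy */
			break;
	} while (len);

	return len;				/* bytes that did not make it */
}

int main(void)
{
	char src[32] = "perf ring buffer payload", dst[32] = { 0 };

	printf("left over (memcpy):   %lu\n",
	       output_copy(memcpy_common, dst, sizeof(dst), src, 25));
	memset(dst, 0, sizeof(dst));
	printf("left over (faulting): %lu\n",
	       output_copy(memcpy_faulting, dst, sizeof(dst), src, 25));
	return 0;
}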
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b5..e8b168af135b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
12#include <linux/perf_event.h> 12#include <linux/perf_event.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/circ_buf.h>
15 16
16#include "internal.h" 17#include "internal.h"
17 18
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
23
24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
30 return true;
31
32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
39
40 offset = (offset - tail) & mask;
41 head = (head - tail) & mask;
42
43 if ((int)(head - offset) < 0)
44 return false;
45
46 return true;
47}
48
49static void perf_output_wakeup(struct perf_output_handle *handle) 19static void perf_output_wakeup(struct perf_output_handle *handle)
50{ 20{
51 atomic_set(&handle->rb->poll, POLL_IN); 21 atomic_set(&handle->rb->poll, POLL_IN);
@@ -87,15 +57,36 @@ again:
87 goto out; 57 goto out;
88 58
89 /* 59 /*
90 * Publish the known good head. Rely on the full barrier implied 60 * Since the mmap() consumer (userspace) can run on a different CPU:
91 * by atomic_dec_and_test() order the rb->head read and this 61 *
92 * write. 62 * kernel user
63 *
64 * READ ->data_tail READ ->data_head
65 * smp_mb() (A) smp_rmb() (C)
66 * WRITE $data READ $data
67 * smp_wmb() (B) smp_mb() (D)
68 * STORE ->data_head WRITE ->data_tail
69 *
70 * Where A pairs with D, and B pairs with C.
71 *
72 * I don't think A needs to be a full barrier because we won't in fact
73 * write data until we see the store from userspace. So we simply don't
74 * issue the data WRITE until we observe it. Be conservative for now.
75 *
76 * OTOH, D needs to be a full barrier since it separates the data READ
77 * from the tail WRITE.
78 *
79 * For B a WMB is sufficient since it separates two WRITEs, and for C
80 * an RMB is sufficient since it separates two READs.
81 *
82 * See perf_output_begin().
93 */ 83 */
84 smp_wmb();
94 rb->user_page->data_head = head; 85 rb->user_page->data_head = head;
95 86
96 /* 87 /*
97 * Now check if we missed an update, rely on the (compiler) 88 * Now check if we missed an update -- rely on previous implied
98 * barrier in atomic_dec_and_test() to re-read rb->head. 89 * compiler barriers to force a re-read.
99 */ 90 */
100 if (unlikely(head != local_read(&rb->head))) { 91 if (unlikely(head != local_read(&rb->head))) {
101 local_inc(&rb->nest); 92 local_inc(&rb->nest);
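The ordering table in the comment above is the whole mmap() protocol: the kernel's smp_wmb() (B) before publishing data_head pairs with the consumer's read barrier (C), and the consumer's barrier (D) before storing data_tail pairs with the kernel's barrier (A) before reusing the space. A hedged consumer-side sketch in C11 atomics, mapping C to an acquire load and D to a release store (the comment is conservative and asks for a full mb at D); the struct below is a stand-in, not the real perf_event_mmap_page:

#include <stdatomic.h>
#include <stdint.h>

/* Hypothetical stand-in for the mmap()ed control page plus data area. */
struct ring {
	_Atomic uint64_t data_head;	/* written by the producer (kernel) side */
	_Atomic uint64_t data_tail;	/* written by the consumer side */
	uint64_t size;			/* power of two */
	unsigned char data[];
};

static uint64_t ring_consume(struct ring *r,
			     void (*handle)(const void *buf, uint64_t len))
{
	uint64_t tail = atomic_load_explicit(&r->data_tail, memory_order_relaxed);
	uint64_t head = atomic_load_explicit(&r->data_head, memory_order_acquire); /* C */
	uint64_t n = 0;

	while (tail != head) {
		uint64_t off = tail & (r->size - 1);
		uint64_t len = head - tail;

		if (len > r->size - off)
			len = r->size - off;	/* don't wrap within one call */
		handle(r->data + off, len);
		tail += len;
		n += len;
	}

	/* Free the space only after all the data reads above have completed. */
	atomic_store_explicit(&r->data_tail, tail, memory_order_release);	/* D */
	return n;
}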
@@ -114,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,
114{ 105{
115 struct ring_buffer *rb; 106 struct ring_buffer *rb;
116 unsigned long tail, offset, head; 107 unsigned long tail, offset, head;
117 int have_lost; 108 int have_lost, page_shift;
118 struct perf_sample_data sample_data;
119 struct { 109 struct {
120 struct perf_event_header header; 110 struct perf_event_header header;
121 u64 id; 111 u64 id;
@@ -130,55 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,
130 event = event->parent; 120 event = event->parent;
131 121
132 rb = rcu_dereference(event->rb); 122 rb = rcu_dereference(event->rb);
133 if (!rb) 123 if (unlikely(!rb))
134 goto out; 124 goto out;
135 125
136 handle->rb = rb; 126 if (unlikely(!rb->nr_pages))
137 handle->event = event;
138
139 if (!rb->nr_pages)
140 goto out; 127 goto out;
141 128
129 handle->rb = rb;
130 handle->event = event;
131
142 have_lost = local_read(&rb->lost); 132 have_lost = local_read(&rb->lost);
143 if (have_lost) { 133 if (unlikely(have_lost)) {
144 lost_event.header.size = sizeof(lost_event); 134 size += sizeof(lost_event);
145 perf_event_header__init_id(&lost_event.header, &sample_data, 135 if (event->attr.sample_id_all)
146 event); 136 size += event->id_header_size;
147 size += lost_event.header.size;
148 } 137 }
149 138
150 perf_output_get_handle(handle); 139 perf_output_get_handle(handle);
151 140
152 do { 141 do {
153 /*
154 * Userspace could choose to issue a mb() before updating the
155 * tail pointer. So that all reads will be completed before the
156 * write is issued.
157 */
158 tail = ACCESS_ONCE(rb->user_page->data_tail); 142 tail = ACCESS_ONCE(rb->user_page->data_tail);
159 smp_rmb();
160 offset = head = local_read(&rb->head); 143 offset = head = local_read(&rb->head);
161 head += size; 144 if (!rb->overwrite &&
162 if (unlikely(!perf_output_space(rb, tail, offset, head))) 145 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
163 goto fail; 146 goto fail;
147 head += size;
164 } while (local_cmpxchg(&rb->head, offset, head) != offset); 148 } while (local_cmpxchg(&rb->head, offset, head) != offset);
165 149
166 if (head - local_read(&rb->wakeup) > rb->watermark) 150 /*
151 * Separate the userpage->tail read from the data stores below.
152 * Matches the MB userspace SHOULD issue after reading the data
153 * and before storing the new tail position.
154 *
155 * See perf_output_put_handle().
156 */
157 smp_mb();
158
159 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
167 local_add(rb->watermark, &rb->wakeup); 160 local_add(rb->watermark, &rb->wakeup);
168 161
169 handle->page = offset >> (PAGE_SHIFT + page_order(rb)); 162 page_shift = PAGE_SHIFT + page_order(rb);
170 handle->page &= rb->nr_pages - 1;
171 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
172 handle->addr = rb->data_pages[handle->page];
173 handle->addr += handle->size;
174 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
175 163
176 if (have_lost) { 164 handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
165 offset &= (1UL << page_shift) - 1;
166 handle->addr = rb->data_pages[handle->page] + offset;
167 handle->size = (1UL << page_shift) - offset;
168
169 if (unlikely(have_lost)) {
170 struct perf_sample_data sample_data;
171
172 lost_event.header.size = sizeof(lost_event);
177 lost_event.header.type = PERF_RECORD_LOST; 173 lost_event.header.type = PERF_RECORD_LOST;
178 lost_event.header.misc = 0; 174 lost_event.header.misc = 0;
179 lost_event.id = event->id; 175 lost_event.id = event->id;
180 lost_event.lost = local_xchg(&rb->lost, 0); 176 lost_event.lost = local_xchg(&rb->lost, 0);
181 177
178 perf_event_header__init_id(&lost_event.header,
179 &sample_data, event);
182 perf_output_put(handle, lost_event); 180 perf_output_put(handle, lost_event);
183 perf_event__output_id_sample(event, handle, &sample_data); 181 perf_event__output_id_sample(event, handle, &sample_data);
184 } 182 }
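With the open-coded perf_output_space() gone, perf_output_begin() leans on CIRC_SPACE() from <linux/circ_buf.h>, which works on free-running head/tail counters and a power-of-two size, and reports at most size-1 free bytes so "full" stays distinguishable from "empty". A quick userspace check of the arithmetic with made-up positions:

#include <stdio.h>

/* Same definitions as include/linux/circ_buf.h */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

int main(void)
{
	unsigned long size = 16;	/* must be a power of two */
	unsigned long tail = 5;		/* consumer position (data_tail) */
	unsigned long head = 12;	/* producer position (rb->head) */
	unsigned long record = 10;	/* size of the record we want to reserve */

	printf("used : %lu\n", CIRC_CNT(head, tail, size));	/* 7 */
	printf("free : %lu\n", CIRC_SPACE(head, tail, size));	/* 8 (one slot kept empty) */

	/* The reservation test in perf_output_begin(): fail if it won't fit. */
	printf("fits : %s\n", CIRC_SPACE(head, tail, size) < record ? "no" : "yes");
	return 0;
}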
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f3569747d629..24b7d6ca871b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,7 @@
35#include <linux/kdebug.h> /* notifier mechanism */ 35#include <linux/kdebug.h> /* notifier mechanism */
36#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h> 37#include <linux/percpu-rwsem.h>
38#include <linux/task_work.h>
38 39
39#include <linux/uprobes.h> 40#include <linux/uprobes.h>
40 41
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
244 * the architecture. If an arch has variable length instruction and the 245 * the architecture. If an arch has variable length instruction and the
245 * breakpoint instruction is not of the smallest length instruction 246 * breakpoint instruction is not of the smallest length instruction
246 * supported by that architecture then we need to modify is_trap_at_addr and 247 * supported by that architecture then we need to modify is_trap_at_addr and
247 * write_opcode accordingly. This would never be a problem for archs that 248 * uprobe_write_opcode accordingly. This would never be a problem for archs
248 * have fixed length instructions. 249 * that have fixed length instructions.
249 */ 250 */
250 251
251/* 252/*
252 * write_opcode - write the opcode at a given virtual address. 253 * uprobe_write_opcode - write the opcode at a given virtual address.
253 * @mm: the probed process address space. 254 * @mm: the probed process address space.
254 * @vaddr: the virtual address to store the opcode. 255 * @vaddr: the virtual address to store the opcode.
255 * @opcode: opcode to be written at @vaddr. 256 * @opcode: opcode to be written at @vaddr.
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
260 * For mm @mm, write the opcode at @vaddr. 261 * For mm @mm, write the opcode at @vaddr.
261 * Return 0 (success) or a negative errno. 262 * Return 0 (success) or a negative errno.
262 */ 263 */
263static int write_opcode(struct mm_struct *mm, unsigned long vaddr, 264int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
264 uprobe_opcode_t opcode) 265 uprobe_opcode_t opcode)
265{ 266{
266 struct page *old_page, *new_page; 267 struct page *old_page, *new_page;
@@ -314,7 +315,7 @@ put_old:
314 */ 315 */
315int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 316int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
316{ 317{
317 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); 318 return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
318} 319}
319 320
320/** 321/**
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
329int __weak 330int __weak
330set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 331set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
331{ 332{
332 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); 333 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
333} 334}
334 335
335static int match_uprobe(struct uprobe *l, struct uprobe *r) 336static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
503 return ret; 504 return ret;
504} 505}
505 506
506static int 507static int __copy_insn(struct address_space *mapping, struct file *filp,
507__copy_insn(struct address_space *mapping, struct file *filp, char *insn, 508 void *insn, int nbytes, loff_t offset)
508 unsigned long nbytes, loff_t offset)
509{ 509{
510 struct page *page; 510 struct page *page;
511 511
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
527 527
528static int copy_insn(struct uprobe *uprobe, struct file *filp) 528static int copy_insn(struct uprobe *uprobe, struct file *filp)
529{ 529{
530 struct address_space *mapping; 530 struct address_space *mapping = uprobe->inode->i_mapping;
531 unsigned long nbytes; 531 loff_t offs = uprobe->offset;
532 int bytes; 532 void *insn = uprobe->arch.insn;
533 533 int size = MAX_UINSN_BYTES;
534 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); 534 int len, err = -EIO;
535 mapping = uprobe->inode->i_mapping;
536 535
537 /* Instruction at end of binary; copy only available bytes */ 536 /* Copy only available bytes, -EIO if nothing was read */
538 if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) 537 do {
539 bytes = uprobe->inode->i_size - uprobe->offset; 538 if (offs >= i_size_read(uprobe->inode))
540 else 539 break;
541 bytes = MAX_UINSN_BYTES;
542 540
543 /* Instruction at the page-boundary; copy bytes in second page */ 541 len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
544 if (nbytes < bytes) { 542 err = __copy_insn(mapping, filp, insn, len, offs);
545 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
546 bytes - nbytes, uprobe->offset + nbytes);
547 if (err) 543 if (err)
548 return err; 544 break;
549 bytes = nbytes; 545
550 } 546 insn += len;
551 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); 547 offs += len;
548 size -= len;
549 } while (size);
550
551 return err;
552} 552}
553 553
554static int prepare_uprobe(struct uprobe *uprobe, struct file *file, 554static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
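copy_insn() is now one loop that never lets a single __copy_insn() call cross a page boundary and stops at end-of-file, instead of special-casing "instruction at end of binary" and "instruction straddling a page". The chunking arithmetic on its own (made-up offsets, userspace C):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	unsigned long long i_size = 20000;	/* i_size_read() stand-in */
	unsigned long long offs = 4090;		/* probe offset near a page boundary */
	int size = 16;				/* MAX_UINSN_BYTES stand-in */

	while (size) {
		int len;

		if (offs >= i_size)
			break;			/* -EIO in the kernel if nothing was read */

		/* bytes left in the current page, capped at what we still need */
		len = min_int(size, (int)(PAGE_SIZE - (offs & ~PAGE_MASK)));
		printf("copy %2d bytes from offset %llu (page %llu)\n",
		       len, offs, offs / PAGE_SIZE);

		offs += len;
		size -= len;
	}
	return 0;
}

With the numbers above the loop issues two copies, 6 bytes from the end of page 0 and 10 bytes from the start of page 1, which is exactly the page-straddling case the old code handled explicitly.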
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
576 if (ret) 576 if (ret)
577 goto out; 577 goto out;
578 578
579 /* write_opcode() assumes we don't cross page boundary */ 579 /* uprobe_write_opcode() assumes we don't cross page boundary */
580 BUG_ON((uprobe->offset & ~PAGE_MASK) + 580 BUG_ON((uprobe->offset & ~PAGE_MASK) +
581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); 581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
582 582
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1096} 1096}
1097 1097
1098/* Slot allocation for XOL */ 1098/* Slot allocation for XOL */
1099static int xol_add_vma(struct xol_area *area) 1099static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1100{ 1100{
1101 struct mm_struct *mm = current->mm;
1102 int ret = -EALREADY; 1101 int ret = -EALREADY;
1103 1102
1104 down_write(&mm->mmap_sem); 1103 down_write(&mm->mmap_sem);
1105 if (mm->uprobes_state.xol_area) 1104 if (mm->uprobes_state.xol_area)
1106 goto fail; 1105 goto fail;
1107 1106
1108 ret = -ENOMEM; 1107 if (!area->vaddr) {
1109 /* Try to map as high as possible, this is only a hint. */ 1108 /* Try to map as high as possible, this is only a hint. */
1110 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1109 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1111 if (area->vaddr & ~PAGE_MASK) { 1110 PAGE_SIZE, 0, 0);
1112 ret = area->vaddr; 1111 if (area->vaddr & ~PAGE_MASK) {
1113 goto fail; 1112 ret = area->vaddr;
1113 goto fail;
1114 }
1114 } 1115 }
1115 1116
1116 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, 1117 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)
1120 1121
1121 smp_wmb(); /* pairs with get_xol_area() */ 1122 smp_wmb(); /* pairs with get_xol_area() */
1122 mm->uprobes_state.xol_area = area; 1123 mm->uprobes_state.xol_area = area;
1123 ret = 0;
1124 fail: 1124 fail:
1125 up_write(&mm->mmap_sem); 1125 up_write(&mm->mmap_sem);
1126 1126
1127 return ret; 1127 return ret;
1128} 1128}
1129 1129
1130/* 1130static struct xol_area *__create_xol_area(unsigned long vaddr)
1131 * get_xol_area - Allocate process's xol_area if necessary.
1132 * This area will be used for storing instructions for execution out of line.
1133 *
1134 * Returns the allocated area or NULL.
1135 */
1136static struct xol_area *get_xol_area(void)
1137{ 1131{
1138 struct mm_struct *mm = current->mm; 1132 struct mm_struct *mm = current->mm;
1139 struct xol_area *area;
1140 uprobe_opcode_t insn = UPROBE_SWBP_INSN; 1133 uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1134 struct xol_area *area;
1141 1135
1142 area = mm->uprobes_state.xol_area; 1136 area = kmalloc(sizeof(*area), GFP_KERNEL);
1143 if (area)
1144 goto ret;
1145
1146 area = kzalloc(sizeof(*area), GFP_KERNEL);
1147 if (unlikely(!area)) 1137 if (unlikely(!area))
1148 goto out; 1138 goto out;
1149 1139
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)
1155 if (!area->page) 1145 if (!area->page)
1156 goto free_bitmap; 1146 goto free_bitmap;
1157 1147
1158 /* allocate first slot of task's xol_area for the return probes */ 1148 area->vaddr = vaddr;
1149 init_waitqueue_head(&area->wq);
1150 /* Reserve the 1st slot for get_trampoline_vaddr() */
1159 set_bit(0, area->bitmap); 1151 set_bit(0, area->bitmap);
1160 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161 atomic_set(&area->slot_count, 1); 1152 atomic_set(&area->slot_count, 1);
1162 init_waitqueue_head(&area->wq); 1153 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1163 1154
1164 if (!xol_add_vma(area)) 1155 if (!xol_add_vma(mm, area))
1165 return area; 1156 return area;
1166 1157
1167 __free_page(area->page); 1158 __free_page(area->page);
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)
1170 free_area: 1161 free_area:
1171 kfree(area); 1162 kfree(area);
1172 out: 1163 out:
1164 return NULL;
1165}
1166
1167/*
1168 * get_xol_area - Allocate process's xol_area if necessary.
1169 * This area will be used for storing instructions for execution out of line.
1170 *
1171 * Returns the allocated area or NULL.
1172 */
1173static struct xol_area *get_xol_area(void)
1174{
1175 struct mm_struct *mm = current->mm;
1176 struct xol_area *area;
1177
1178 if (!mm->uprobes_state.xol_area)
1179 __create_xol_area(0);
1180
1173 area = mm->uprobes_state.xol_area; 1181 area = mm->uprobes_state.xol_area;
1174 ret: 1182 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1175 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1176 return area; 1183 return area;
1177} 1184}
1178 1185
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1256 return 0; 1263 return 0;
1257 1264
1258 /* Initialize the slot */ 1265 /* Initialize the slot */
1259 copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); 1266 copy_to_page(area->page, xol_vaddr,
1267 uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1260 /* 1268 /*
1261 * We probably need flush_icache_user_range() but it needs vma. 1269 * We probably need flush_icache_user_range() but it needs vma.
1262 * This should work on supported architectures too. 1270 * This should work on supported architectures too.
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)
1345} 1353}
1346 1354
1347/* 1355/*
1348 * Called in context of a new clone/fork from copy_process.
1349 */
1350void uprobe_copy_process(struct task_struct *t)
1351{
1352 t->utask = NULL;
1353}
1354
1355/*
1356 * Allocate a uprobe_task object for the task if if necessary. 1356 * Allocate a uprobe_task object for the task if if necessary.
1357 * Called when the thread hits a breakpoint. 1357 * Called when the thread hits a breakpoint.
1358 * 1358 *
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)
1367 return current->utask; 1367 return current->utask;
1368} 1368}
1369 1369
1370static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1371{
1372 struct uprobe_task *n_utask;
1373 struct return_instance **p, *o, *n;
1374
1375 n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1376 if (!n_utask)
1377 return -ENOMEM;
1378 t->utask = n_utask;
1379
1380 p = &n_utask->return_instances;
1381 for (o = o_utask->return_instances; o; o = o->next) {
1382 n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1383 if (!n)
1384 return -ENOMEM;
1385
1386 *n = *o;
1387 atomic_inc(&n->uprobe->ref);
1388 n->next = NULL;
1389
1390 *p = n;
1391 p = &n->next;
1392 n_utask->depth++;
1393 }
1394
1395 return 0;
1396}
1397
1398static void uprobe_warn(struct task_struct *t, const char *msg)
1399{
1400 pr_warn("uprobe: %s:%d failed to %s\n",
1401 current->comm, current->pid, msg);
1402}
1403
1404static void dup_xol_work(struct callback_head *work)
1405{
1406 kfree(work);
1407
1408 if (current->flags & PF_EXITING)
1409 return;
1410
1411 if (!__create_xol_area(current->utask->vaddr))
1412 uprobe_warn(current, "dup xol area");
1413}
1414
1415/*
1416 * Called in context of a new clone/fork from copy_process.
1417 */
1418void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1419{
1420 struct uprobe_task *utask = current->utask;
1421 struct mm_struct *mm = current->mm;
1422 struct callback_head *work;
1423 struct xol_area *area;
1424
1425 t->utask = NULL;
1426
1427 if (!utask || !utask->return_instances)
1428 return;
1429
1430 if (mm == t->mm && !(flags & CLONE_VFORK))
1431 return;
1432
1433 if (dup_utask(t, utask))
1434 return uprobe_warn(t, "dup ret instances");
1435
1436 /* The task can fork() after dup_xol_work() fails */
1437 area = mm->uprobes_state.xol_area;
1438 if (!area)
1439 return uprobe_warn(t, "dup xol area");
1440
1441 if (mm == t->mm)
1442 return;
1443
1444 /* TODO: move it into the union in uprobe_task */
1445 work = kmalloc(sizeof(*work), GFP_KERNEL);
1446 if (!work)
1447 return uprobe_warn(t, "dup xol area");
1448
1449 t->utask->vaddr = area->vaddr;
1450 init_task_work(work, dup_xol_work);
1451 task_work_add(t, work, true);
1452}
1453
1370/* 1454/*
1371 * Current area->vaddr notion assume the trampoline address is always 1455 * Current area->vaddr notion assume the trampoline address is always
1372 * equal area->vaddr. 1456 * equal area->vaddr.
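dup_utask() copies the parent's return_instance chain in order using the pointer-to-pointer idiom (p = &n_utask->return_instances; ...; p = &n->next), so no tail pointer or reversal pass is needed. The same idiom in standalone form (payload and error handling simplified; the kernel version also takes a reference on each uprobe):

#include <stdio.h>
#include <stdlib.h>

struct return_instance {
	int id;				/* payload stand-in */
	struct return_instance *next;
};

static struct return_instance *dup_chain(const struct return_instance *o)
{
	struct return_instance *head = NULL, **p = &head, *n;

	for (; o; o = o->next) {
		n = malloc(sizeof(*n));
		if (!n)
			break;			/* the kernel version returns -ENOMEM here */
		*n = *o;
		n->next = NULL;
		*p = n;				/* append at the tail via the link pointer */
		p = &n->next;
	}
	return head;
}

int main(void)
{
	struct return_instance c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct return_instance *copy = dup_chain(&a), *tmp;

	for (; copy; copy = tmp) {
		tmp = copy->next;
		printf("%d ", copy->id);	/* 1 2 3: original order preserved */
		free(copy);
	}
	printf("\n");
	return 0;
}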
@@ -1682,12 +1766,10 @@ static bool handle_trampoline(struct pt_regs *regs)
1682 tmp = ri; 1766 tmp = ri;
1683 ri = ri->next; 1767 ri = ri->next;
1684 kfree(tmp); 1768 kfree(tmp);
1769 utask->depth--;
1685 1770
1686 if (!chained) 1771 if (!chained)
1687 break; 1772 break;
1688
1689 utask->depth--;
1690
1691 BUG_ON(!ri); 1773 BUG_ON(!ri);
1692 } 1774 }
1693 1775
@@ -1859,9 +1941,4 @@ static int __init init_uprobes(void)
1859 1941
1860 return register_die_notifier(&uprobe_exception_nb); 1942 return register_die_notifier(&uprobe_exception_nb);
1861} 1943}
1862module_init(init_uprobes); 1944__initcall(init_uprobes);
1863
1864static void __exit exit_uprobes(void)
1865{
1866}
1867module_exit(exit_uprobes);
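The dup_utask() hunk above copies the parent's return_instance chain with the pointer-to-pointer tail idiom, so the loop needs no special case for the first element; the diff then defers XOL-area creation to the child itself through task_work_add(), which is why dup_xol_work() operates on current. A minimal userspace C sketch of just the tail-append idiom (hypothetical node type and dup_list() helper, not the kernel structures):

#include <stdio.h>
#include <stdlib.h>

struct node {
        int val;
        struct node *next;
};

/* Duplicate a singly linked list, appending through a pointer-to-pointer
 * so there is no "first element" special case -- same shape as dup_utask(). */
static struct node *dup_list(const struct node *src)
{
        struct node *head = NULL;
        struct node **p = &head;
        const struct node *o;

        for (o = src; o; o = o->next) {
                struct node *n = malloc(sizeof(*n));
                if (!n)
                        break;          /* partial copy; caller decides what to do */
                n->val = o->val;
                n->next = NULL;
                *p = n;                 /* link at the current tail */
                p = &n->next;           /* advance the tail pointer */
        }
        return head;
}

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

        for (struct node *n = dup_list(&a); n; n = n->next)
                printf("%d\n", n->val);
        return 0;
}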
diff --git a/kernel/extable.c b/kernel/extable.c
index 67460b93b1a1..832cb28105bb 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1;
41/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 42void __init sort_main_extable(void)
43{ 43{
44 if (main_extable_sort_needed) { 44 if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
45 pr_notice("Sorting __ex_table...\n"); 45 pr_notice("Sorting __ex_table...\n");
46 sort_extable(__start___ex_table, __stop___ex_table); 46 sort_extable(__start___ex_table, __stop___ex_table);
47 } 47 }
diff --git a/kernel/fork.c b/kernel/fork.c
index bf46287c91a4..728d5be9548c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
351 struct rb_node **rb_link, *rb_parent; 351 struct rb_node **rb_link, *rb_parent;
352 int retval; 352 int retval;
353 unsigned long charge; 353 unsigned long charge;
354 struct mempolicy *pol;
355 354
356 uprobe_start_dup_mmap(); 355 uprobe_start_dup_mmap();
357 down_write(&oldmm->mmap_sem); 356 down_write(&oldmm->mmap_sem);
@@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
400 goto fail_nomem; 399 goto fail_nomem;
401 *tmp = *mpnt; 400 *tmp = *mpnt;
402 INIT_LIST_HEAD(&tmp->anon_vma_chain); 401 INIT_LIST_HEAD(&tmp->anon_vma_chain);
403 pol = mpol_dup(vma_policy(mpnt)); 402 retval = vma_dup_policy(mpnt, tmp);
404 retval = PTR_ERR(pol); 403 if (retval)
405 if (IS_ERR(pol))
406 goto fail_nomem_policy; 404 goto fail_nomem_policy;
407 vma_set_policy(tmp, pol);
408 tmp->vm_mm = mm; 405 tmp->vm_mm = mm;
409 if (anon_vma_fork(tmp, mpnt)) 406 if (anon_vma_fork(tmp, mpnt))
410 goto fail_nomem_anon_vma_fork; 407 goto fail_nomem_anon_vma_fork;
@@ -472,7 +469,7 @@ out:
472 uprobe_end_dup_mmap(); 469 uprobe_end_dup_mmap();
473 return retval; 470 return retval;
474fail_nomem_anon_vma_fork: 471fail_nomem_anon_vma_fork:
475 mpol_put(pol); 472 mpol_put(vma_policy(tmp));
476fail_nomem_policy: 473fail_nomem_policy:
477 kmem_cache_free(vm_area_cachep, tmp); 474 kmem_cache_free(vm_area_cachep, tmp);
478fail_nomem: 475fail_nomem:
@@ -522,7 +519,7 @@ static void mm_init_aio(struct mm_struct *mm)
522{ 519{
523#ifdef CONFIG_AIO 520#ifdef CONFIG_AIO
524 spin_lock_init(&mm->ioctx_lock); 521 spin_lock_init(&mm->ioctx_lock);
525 INIT_HLIST_HEAD(&mm->ioctx_list); 522 mm->ioctx_table = NULL;
526#endif 523#endif
527} 524}
528 525
@@ -535,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
535 mm->flags = (current->mm) ? 532 mm->flags = (current->mm) ?
536 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 533 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
537 mm->core_state = NULL; 534 mm->core_state = NULL;
538 mm->nr_ptes = 0; 535 atomic_long_set(&mm->nr_ptes, 0);
539 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 536 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
540 spin_lock_init(&mm->page_table_lock); 537 spin_lock_init(&mm->page_table_lock);
541 mm_init_aio(mm); 538 mm_init_aio(mm);
@@ -563,7 +560,7 @@ static void check_mm(struct mm_struct *mm)
563 "mm:%p idx:%d val:%ld\n", mm, i, x); 560 "mm:%p idx:%d val:%ld\n", mm, i, x);
564 } 561 }
565 562
566#ifdef CONFIG_TRANSPARENT_HUGEPAGE 563#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
567 VM_BUG_ON(mm->pmd_huge_pte); 564 VM_BUG_ON(mm->pmd_huge_pte);
568#endif 565#endif
569} 566}
@@ -817,12 +814,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
817 memcpy(mm, oldmm, sizeof(*mm)); 814 memcpy(mm, oldmm, sizeof(*mm));
818 mm_init_cpumask(mm); 815 mm_init_cpumask(mm);
819 816
820#ifdef CONFIG_TRANSPARENT_HUGEPAGE 817#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
821 mm->pmd_huge_pte = NULL; 818 mm->pmd_huge_pte = NULL;
822#endif 819#endif
823#ifdef CONFIG_NUMA_BALANCING
824 mm->first_nid = NUMA_PTE_SCAN_INIT;
825#endif
826 if (!mm_init(mm, tsk)) 820 if (!mm_init(mm, tsk))
827 goto fail_nomem; 821 goto fail_nomem;
828 822
@@ -1173,13 +1167,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1173 return ERR_PTR(-EINVAL); 1167 return ERR_PTR(-EINVAL);
1174 1168
1175 /* 1169 /*
1176 * If the new process will be in a different pid namespace 1170 * If the new process will be in a different pid or user namespace
1177 * don't allow the creation of threads. 1171 * do not allow it to share a thread group or signal handlers or
1172 * parent with the forking task.
1178 */ 1173 */
1179 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && 1174 if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) {
1180 (task_active_pid_ns(current) != 1175 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1181 current->nsproxy->pid_ns_for_children)) 1176 (task_active_pid_ns(current) !=
1182 return ERR_PTR(-EINVAL); 1177 current->nsproxy->pid_ns_for_children))
1178 return ERR_PTR(-EINVAL);
1179 }
1183 1180
1184 retval = security_task_create(clone_flags); 1181 retval = security_task_create(clone_flags);
1185 if (retval) 1182 if (retval)
@@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1313#endif 1310#endif
1314 1311
1315 /* Perform scheduler related setup. Assign this task to a CPU. */ 1312 /* Perform scheduler related setup. Assign this task to a CPU. */
1316 sched_fork(p); 1313 sched_fork(clone_flags, p);
1317 1314
1318 retval = perf_event_init_task(p); 1315 retval = perf_event_init_task(p);
1319 if (retval) 1316 if (retval)
@@ -1373,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1373 INIT_LIST_HEAD(&p->pi_state_list); 1370 INIT_LIST_HEAD(&p->pi_state_list);
1374 p->pi_state_cache = NULL; 1371 p->pi_state_cache = NULL;
1375#endif 1372#endif
1376 uprobe_copy_process(p);
1377 /* 1373 /*
1378 * sigaltstack should be cleared when sharing the same VM 1374 * sigaltstack should be cleared when sharing the same VM
1379 */ 1375 */
@@ -1490,6 +1486,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1490 perf_event_fork(p); 1486 perf_event_fork(p);
1491 1487
1492 trace_task_newtask(p, clone_flags); 1488 trace_task_newtask(p, clone_flags);
1489 uprobe_copy_process(p, clone_flags);
1493 1490
1494 return p; 1491 return p;
1495 1492
@@ -1576,15 +1573,6 @@ long do_fork(unsigned long clone_flags,
1576 long nr; 1573 long nr;
1577 1574
1578 /* 1575 /*
1579 * Do some preliminary argument and permissions checking before we
1580 * actually start allocating stuff
1581 */
1582 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1583 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1584 return -EINVAL;
1585 }
1586
1587 /*
1588 * Determine whether and which event to report to ptracer. When 1576 * Determine whether and which event to report to ptracer. When
1589 * called from kernel_thread or CLONE_UNTRACED is explicitly 1577 * called from kernel_thread or CLONE_UNTRACED is explicitly
1590 * requested, no event is reported; otherwise, report if the event 1578 * requested, no event is reported; otherwise, report if the event
@@ -1825,11 +1813,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1825 if (unshare_flags & CLONE_NEWUSER) 1813 if (unshare_flags & CLONE_NEWUSER)
1826 unshare_flags |= CLONE_THREAD | CLONE_FS; 1814 unshare_flags |= CLONE_THREAD | CLONE_FS;
1827 /* 1815 /*
1828 * If unsharing a pid namespace must also unshare the thread.
1829 */
1830 if (unshare_flags & CLONE_NEWPID)
1831 unshare_flags |= CLONE_THREAD;
1832 /*
1833 * If unsharing a thread from a thread group, must also unshare vm. 1816 * If unsharing a thread from a thread group, must also unshare vm.
1834 */ 1817 */
1835 if (unshare_flags & CLONE_THREAD) 1818 if (unshare_flags & CLONE_THREAD)
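The copy_process() hunk above folds the old do_fork() pre-check into a single condition: CLONE_SIGHAND or CLONE_PARENT is rejected when the child would end up in a new user or pid namespace, or in a different child pid namespace. A condensed userspace restatement of that predicate only (CLONE_* constants from <sched.h>; the pid-namespace comparison is stubbed as a boolean parameter because it depends on kernel state):

#define _GNU_SOURCE
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for "task_active_pid_ns(current) != current->nsproxy->pid_ns_for_children";
 * in the kernel this is real state, here it is just a parameter. */
static bool would_reject(unsigned long flags, bool child_pid_ns_differs)
{
        if (flags & (CLONE_SIGHAND | CLONE_PARENT)) {
                if ((flags & (CLONE_NEWUSER | CLONE_NEWPID)) || child_pid_ns_differs)
                        return true;    /* copy_process() returns ERR_PTR(-EINVAL) */
        }
        return false;
}

int main(void)
{
        printf("%d\n", would_reject(CLONE_SIGHAND | CLONE_NEWUSER, false)); /* 1 */
        printf("%d\n", would_reject(CLONE_VM, true));                       /* 0 */
        return 0;
}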
diff --git a/kernel/futex.c b/kernel/futex.c
index c3a1a55a5214..80ba086f021d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -66,7 +66,7 @@
66 66
67#include <asm/futex.h> 67#include <asm/futex.h>
68 68
69#include "rtmutex_common.h" 69#include "locking/rtmutex_common.h"
70 70
71int __read_mostly futex_cmpxchg_enabled; 71int __read_mostly futex_cmpxchg_enabled;
72 72
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d4da55d1fb65..d04ce8ac4399 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -46,4 +46,34 @@ config GCOV_PROFILE_ALL
46 larger and run slower. Also be sure to exclude files from profiling 46 larger and run slower. Also be sure to exclude files from profiling
47 which are not linked to the kernel image to prevent linker errors. 47 which are not linked to the kernel image to prevent linker errors.
48 48
49choice
50 prompt "Specify GCOV format"
51 depends on GCOV_KERNEL
52 default GCOV_FORMAT_AUTODETECT
53 ---help---
54 The gcov format is usually determined by the GCC version, but there are
55 exceptions where format changes are integrated in lower-version GCCs.
56 In such a case use this option to adjust the format used in the kernel
57 accordingly.
58
59 If unsure, choose "Autodetect".
60
61config GCOV_FORMAT_AUTODETECT
62 bool "Autodetect"
63 ---help---
64 Select this option to use the format that corresponds to your GCC
65 version.
66
67config GCOV_FORMAT_3_4
68 bool "GCC 3.4 format"
69 ---help---
70 Select this option to use the format defined by GCC 3.4.
71
72config GCOV_FORMAT_4_7
73 bool "GCC 4.7 format"
74 ---help---
75 Select this option to use the format defined by GCC 4.7.
76
77endchoice
78
49endmenu 79endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index e97ca59e2520..52aa7e8de927 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,33 @@
1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2 2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o 3# if-lt
4# Usage VAR := $(call if-lt, $(a), $(b))
5# Returns 1 if (a < b)
6if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
7
8ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
9 cc-ver := 0304
10else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
11 cc-ver := 0407
12else
13# Use cc-version if available, otherwise set 0
14#
15# scripts/Kbuild.include, which contains cc-version function, is not included
16# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
17# Meaning cc-ver is empty, causing the if-lt test to fail with
18# "/bin/sh: line 0: [: -lt: unary operator expected" error message.
19# This has no effect on the clean phase, but the error message could be
20# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
21# is not available. We could probably move if-lt to Kbuild.include, so it is also
22# not defined during clean, or include Kbuild.include in
23# scripts/Makefile.clean. But the following workaround seems least invasive.
24 cc-ver := $(if $(call cc-version),$(call cc-version),0)
25endif
26
27obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
28
29ifeq ($(call if-lt, $(cc-ver), 0407),1)
30 obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
31else
32 obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
33endif
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9b22d03cc581..f45b75b713c0 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -20,7 +20,6 @@
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include "gcov.h" 21#include "gcov.h"
22 22
23static struct gcov_info *gcov_info_head;
24static int gcov_events_enabled; 23static int gcov_events_enabled;
25static DEFINE_MUTEX(gcov_lock); 24static DEFINE_MUTEX(gcov_lock);
26 25
@@ -34,7 +33,7 @@ void __gcov_init(struct gcov_info *info)
34 33
35 mutex_lock(&gcov_lock); 34 mutex_lock(&gcov_lock);
36 if (gcov_version == 0) { 35 if (gcov_version == 0) {
37 gcov_version = info->version; 36 gcov_version = gcov_info_version(info);
38 /* 37 /*
39 * Printing gcc's version magic may prove useful for debugging 38 * Printing gcc's version magic may prove useful for debugging
40 * incompatibility reports. 39 * incompatibility reports.
@@ -45,8 +44,7 @@ void __gcov_init(struct gcov_info *info)
45 * Add new profiling data structure to list and inform event 44 * Add new profiling data structure to list and inform event
46 * listener. 45 * listener.
47 */ 46 */
48 info->next = gcov_info_head; 47 gcov_info_link(info);
49 gcov_info_head = info;
50 if (gcov_events_enabled) 48 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info); 49 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock); 50 mutex_unlock(&gcov_lock);
@@ -81,6 +79,12 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
81} 79}
82EXPORT_SYMBOL(__gcov_merge_delta); 80EXPORT_SYMBOL(__gcov_merge_delta);
83 81
82void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
83{
84 /* Unused. */
85}
86EXPORT_SYMBOL(__gcov_merge_ior);
87
84/** 88/**
85 * gcov_enable_events - enable event reporting through gcov_event() 89 * gcov_enable_events - enable event reporting through gcov_event()
86 * 90 *
@@ -91,13 +95,15 @@ EXPORT_SYMBOL(__gcov_merge_delta);
91 */ 95 */
92void gcov_enable_events(void) 96void gcov_enable_events(void)
93{ 97{
94 struct gcov_info *info; 98 struct gcov_info *info = NULL;
95 99
96 mutex_lock(&gcov_lock); 100 mutex_lock(&gcov_lock);
97 gcov_events_enabled = 1; 101 gcov_events_enabled = 1;
102
98 /* Perform event callback for previously registered entries. */ 103 /* Perform event callback for previously registered entries. */
99 for (info = gcov_info_head; info; info = info->next) 104 while ((info = gcov_info_next(info)))
100 gcov_event(GCOV_ADD, info); 105 gcov_event(GCOV_ADD, info);
106
101 mutex_unlock(&gcov_lock); 107 mutex_unlock(&gcov_lock);
102} 108}
103 109
@@ -112,25 +118,23 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
112 void *data) 118 void *data)
113{ 119{
114 struct module *mod = data; 120 struct module *mod = data;
115 struct gcov_info *info; 121 struct gcov_info *info = NULL;
116 struct gcov_info *prev; 122 struct gcov_info *prev = NULL;
117 123
118 if (event != MODULE_STATE_GOING) 124 if (event != MODULE_STATE_GOING)
119 return NOTIFY_OK; 125 return NOTIFY_OK;
120 mutex_lock(&gcov_lock); 126 mutex_lock(&gcov_lock);
121 prev = NULL; 127
122 /* Remove entries located in module from linked list. */ 128 /* Remove entries located in module from linked list. */
123 for (info = gcov_info_head; info; info = info->next) { 129 while ((info = gcov_info_next(info))) {
124 if (within(info, mod->module_core, mod->core_size)) { 130 if (within(info, mod->module_core, mod->core_size)) {
125 if (prev) 131 gcov_info_unlink(prev, info);
126 prev->next = info->next;
127 else
128 gcov_info_head = info->next;
129 if (gcov_events_enabled) 132 if (gcov_events_enabled)
130 gcov_event(GCOV_REMOVE, info); 133 gcov_event(GCOV_REMOVE, info);
131 } else 134 } else
132 prev = info; 135 prev = info;
133 } 136 }
137
134 mutex_unlock(&gcov_lock); 138 mutex_unlock(&gcov_lock);
135 139
136 return NOTIFY_OK; 140 return NOTIFY_OK;
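With this hunk, gcov_enable_events() and the module notifier walk the list through gcov_info_next(), which returns the head when passed NULL, so the head pointer itself stays private to the gcc-specific file. A minimal userspace sketch of that cursor style (hypothetical item type, not the gcov structures):

#include <stdio.h>

struct item {
        const char *name;
        struct item *next;
};

static struct item c = { "c", NULL }, b = { "b", &c }, a = { "a", &b };
static struct item *head = &a;          /* private to this file, like gcov_info_head */

/* NULL means "start at the head", mirroring gcov_info_next(). */
static struct item *item_next(struct item *it)
{
        return it ? it->next : head;
}

int main(void)
{
        struct item *it = NULL;

        while ((it = item_next(it)))
                printf("%s\n", it->name);
        return 0;
}

The payoff is that callers never name the list head, so swapping in the gcc 4.7 implementation does not touch generic code.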
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 9bd0934f6c33..15ff01a76379 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -74,8 +74,8 @@ static int __init gcov_persist_setup(char *str)
74{ 74{
75 unsigned long val; 75 unsigned long val;
76 76
77 if (strict_strtoul(str, 0, &val)) { 77 if (kstrtoul(str, 0, &val)) {
78 pr_warning("invalid gcov_persist parameter '%s'\n", str); 78 pr_warn("invalid gcov_persist parameter '%s'\n", str);
79 return 0; 79 return 0;
80 } 80 }
81 gcov_persist = val; 81 gcov_persist = val;
@@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name)
242 242
243 list_for_each_entry(node, &all_head, all) { 243 list_for_each_entry(node, &all_head, all) {
244 info = get_node_info(node); 244 info = get_node_info(node);
245 if (info && (strcmp(info->filename, name) == 0)) 245 if (info && (strcmp(gcov_info_filename(info), name) == 0))
246 return node; 246 return node;
247 } 247 }
248 248
@@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
279 seq = file->private_data; 279 seq = file->private_data;
280 info = gcov_iter_get_info(seq->private); 280 info = gcov_iter_get_info(seq->private);
281 mutex_lock(&node_lock); 281 mutex_lock(&node_lock);
282 node = get_node_by_name(info->filename); 282 node = get_node_by_name(gcov_info_filename(info));
283 if (node) { 283 if (node) {
284 /* Reset counts or remove node for unloaded modules. */ 284 /* Reset counts or remove node for unloaded modules. */
285 if (node->num_loaded == 0) 285 if (node->num_loaded == 0)
@@ -365,7 +365,7 @@ static const char *deskew(const char *basename)
365 */ 365 */
366static void add_links(struct gcov_node *node, struct dentry *parent) 366static void add_links(struct gcov_node *node, struct dentry *parent)
367{ 367{
368 char *basename; 368 const char *basename;
369 char *target; 369 char *target;
370 int num; 370 int num;
371 int i; 371 int i;
@@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
376 if (!node->links) 376 if (!node->links)
377 return; 377 return;
378 for (i = 0; i < num; i++) { 378 for (i = 0; i < num; i++) {
379 target = get_link_target(get_node_info(node)->filename, 379 target = get_link_target(
380 &gcov_link[i]); 380 gcov_info_filename(get_node_info(node)),
381 &gcov_link[i]);
381 if (!target) 382 if (!target)
382 goto out_err; 383 goto out_err;
383 basename = strrchr(target, '/'); 384 basename = kbasename(target);
384 if (!basename) 385 if (basename == target)
385 goto out_err; 386 goto out_err;
386 basename++;
387 node->links[i] = debugfs_create_symlink(deskew(basename), 387 node->links[i] = debugfs_create_symlink(deskew(basename),
388 parent, target); 388 parent, target);
389 if (!node->links[i]) 389 if (!node->links[i])
@@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
450 } else 450 } else
451 node->dentry = debugfs_create_dir(node->name, parent->dentry); 451 node->dentry = debugfs_create_dir(node->name, parent->dentry);
452 if (!node->dentry) { 452 if (!node->dentry) {
453 pr_warning("could not create file\n"); 453 pr_warn("could not create file\n");
454 kfree(node); 454 kfree(node);
455 return NULL; 455 return NULL;
456 } 456 }
@@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
463 463
464err_nomem: 464err_nomem:
465 kfree(node); 465 kfree(node);
466 pr_warning("out of memory\n"); 466 pr_warn("out of memory\n");
467 return NULL; 467 return NULL;
468} 468}
469 469
@@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info)
576 struct gcov_node *parent; 576 struct gcov_node *parent;
577 struct gcov_node *node; 577 struct gcov_node *node;
578 578
579 filename = kstrdup(info->filename, GFP_KERNEL); 579 filename = kstrdup(gcov_info_filename(info), GFP_KERNEL);
580 if (!filename) 580 if (!filename)
581 return; 581 return;
582 parent = &root_node; 582 parent = &root_node;
@@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
630 */ 630 */
631 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); 631 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
632 if (!loaded_info) { 632 if (!loaded_info) {
633 pr_warning("could not add '%s' (out of memory)\n", 633 pr_warn("could not add '%s' (out of memory)\n",
634 info->filename); 634 gcov_info_filename(info));
635 return; 635 return;
636 } 636 }
637 memcpy(loaded_info, node->loaded_info, 637 memcpy(loaded_info, node->loaded_info,
@@ -644,8 +644,9 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
644 * data set replaces the copy of the last one. 644 * data set replaces the copy of the last one.
645 */ 645 */
646 if (!gcov_info_is_compatible(node->unloaded_info, info)) { 646 if (!gcov_info_is_compatible(node->unloaded_info, info)) {
647 pr_warning("discarding saved data for %s " 647 pr_warn("discarding saved data for %s "
648 "(incompatible version)\n", info->filename); 648 "(incompatible version)\n",
649 gcov_info_filename(info));
649 gcov_info_free(node->unloaded_info); 650 gcov_info_free(node->unloaded_info);
650 node->unloaded_info = NULL; 651 node->unloaded_info = NULL;
651 } 652 }
@@ -655,8 +656,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
655 * The initial one takes precedence. 656 * The initial one takes precedence.
656 */ 657 */
657 if (!gcov_info_is_compatible(node->loaded_info[0], info)) { 658 if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
658 pr_warning("could not add '%s' (incompatible " 659 pr_warn("could not add '%s' (incompatible "
659 "version)\n", info->filename); 660 "version)\n", gcov_info_filename(info));
660 kfree(loaded_info); 661 kfree(loaded_info);
661 return; 662 return;
662 } 663 }
@@ -691,8 +692,9 @@ static void save_info(struct gcov_node *node, struct gcov_info *info)
691 else { 692 else {
692 node->unloaded_info = gcov_info_dup(info); 693 node->unloaded_info = gcov_info_dup(info);
693 if (!node->unloaded_info) { 694 if (!node->unloaded_info) {
694 pr_warning("could not save data for '%s' " 695 pr_warn("could not save data for '%s' "
695 "(out of memory)\n", info->filename); 696 "(out of memory)\n",
697 gcov_info_filename(info));
696 } 698 }
697 } 699 }
698} 700}
@@ -707,8 +709,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info)
707 709
708 i = get_info_index(node, info); 710 i = get_info_index(node, info);
709 if (i < 0) { 711 if (i < 0) {
710 pr_warning("could not remove '%s' (not found)\n", 712 pr_warn("could not remove '%s' (not found)\n",
711 info->filename); 713 gcov_info_filename(info));
712 return; 714 return;
713 } 715 }
714 if (gcov_persist) 716 if (gcov_persist)
@@ -735,7 +737,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
735 struct gcov_node *node; 737 struct gcov_node *node;
736 738
737 mutex_lock(&node_lock); 739 mutex_lock(&node_lock);
738 node = get_node_by_name(info->filename); 740 node = get_node_by_name(gcov_info_filename(info));
739 switch (action) { 741 switch (action) {
740 case GCOV_ADD: 742 case GCOV_ADD:
741 if (node) 743 if (node)
@@ -747,8 +749,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
747 if (node) 749 if (node)
748 remove_info(node, info); 750 remove_info(node, info);
749 else { 751 else {
750 pr_warning("could not remove '%s' (not found)\n", 752 pr_warn("could not remove '%s' (not found)\n",
751 info->filename); 753 gcov_info_filename(info));
752 } 754 }
753 break; 755 break;
754 } 756 }
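The add_links() hunk replaces the open-coded strrchr()+1 with kbasename() and treats "basename == target" (no '/' in the path) as the error case the old "!basename" check covered. A small userspace stand-in for kbasename() showing why that comparison works (my_basename() is an assumed helper name, not the kernel implementation):

#include <stdio.h>
#include <string.h>

/* Userspace stand-in for the kernel's kbasename(): returns the component
 * after the last '/', or the whole string when there is no '/'. */
static const char *my_basename(const char *path)
{
        const char *tail = strrchr(path, '/');

        return tail ? tail + 1 : path;
}

int main(void)
{
        const char *with_dir = "/obj/kernel/gcov/fs.gcda";
        const char *bare     = "fs.gcda";

        printf("%s\n", my_basename(with_dir));          /* fs.gcda */
        printf("%d\n", my_basename(bare) == bare);      /* 1: no directory part */
        return 0;
}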
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb4260033..27bc88a35013 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -21,6 +21,121 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include "gcov.h" 22#include "gcov.h"
23 23
24#define GCOV_COUNTERS 5
25
26static struct gcov_info *gcov_info_head;
27
28/**
29 * struct gcov_fn_info - profiling meta data per function
30 * @ident: object file-unique function identifier
31 * @checksum: function checksum
32 * @n_ctrs: number of values per counter type belonging to this function
33 *
34 * This data is generated by gcc during compilation and doesn't change
35 * at run-time.
36 */
37struct gcov_fn_info {
38 unsigned int ident;
39 unsigned int checksum;
40 unsigned int n_ctrs[0];
41};
42
43/**
44 * struct gcov_ctr_info - profiling data per counter type
45 * @num: number of counter values for this type
46 * @values: array of counter values for this type
47 * @merge: merge function for counter values of this type (unused)
48 *
49 * This data is generated by gcc during compilation and doesn't change
50 * at run-time with the exception of the values array.
51 */
52struct gcov_ctr_info {
53 unsigned int num;
54 gcov_type *values;
55 void (*merge)(gcov_type *, unsigned int);
56};
57
58/**
59 * struct gcov_info - profiling data per object file
60 * @version: gcov version magic indicating the gcc version used for compilation
61 * @next: list head for a singly-linked list
62 * @stamp: time stamp
63 * @filename: name of the associated gcov data file
64 * @n_functions: number of instrumented functions
65 * @functions: function data
66 * @ctr_mask: mask specifying which counter types are active
67 * @counts: counter data per counter type
68 *
69 * This data is generated by gcc during compilation and doesn't change
70 * at run-time with the exception of the next pointer.
71 */
72struct gcov_info {
73 unsigned int version;
74 struct gcov_info *next;
75 unsigned int stamp;
76 const char *filename;
77 unsigned int n_functions;
78 const struct gcov_fn_info *functions;
79 unsigned int ctr_mask;
80 struct gcov_ctr_info counts[0];
81};
82
83/**
84 * gcov_info_filename - return info filename
85 * @info: profiling data set
86 */
87const char *gcov_info_filename(struct gcov_info *info)
88{
89 return info->filename;
90}
91
92/**
93 * gcov_info_version - return info version
94 * @info: profiling data set
95 */
96unsigned int gcov_info_version(struct gcov_info *info)
97{
98 return info->version;
99}
100
101/**
102 * gcov_info_next - return next profiling data set
103 * @info: profiling data set
104 *
105 * Returns next gcov_info following @info or first gcov_info in the chain if
106 * @info is %NULL.
107 */
108struct gcov_info *gcov_info_next(struct gcov_info *info)
109{
110 if (!info)
111 return gcov_info_head;
112
113 return info->next;
114}
115
116/**
117 * gcov_info_link - link/add profiling data set to the list
118 * @info: profiling data set
119 */
120void gcov_info_link(struct gcov_info *info)
121{
122 info->next = gcov_info_head;
123 gcov_info_head = info;
124}
125
126/**
127 * gcov_info_unlink - unlink/remove profiling data set from the list
128 * @prev: previous profiling data set
129 * @info: profiling data set
130 */
131void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
132{
133 if (prev)
134 prev->next = info->next;
135 else
136 gcov_info_head = info->next;
137}
138
24/* Symbolic links to be created for each profiling data file. */ 139/* Symbolic links to be created for each profiling data file. */
25const struct gcov_link gcov_link[] = { 140const struct gcov_link gcov_link[] = {
26 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ 141 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
new file mode 100644
index 000000000000..2c6e4631c814
--- /dev/null
+++ b/kernel/gcov/gcc_4_7.c
@@ -0,0 +1,560 @@
1/*
2 * This code provides functions to handle gcc's profiling data format
3 * introduced with gcc 4.7.
4 *
5 * This file is based heavily on gcc_3_4.c file.
6 *
7 * For a better understanding, refer to gcc source:
8 * gcc/gcov-io.h
9 * libgcc/libgcov.c
10 *
11 * Uses gcc-internal data definitions.
12 */
13
14#include <linux/errno.h>
15#include <linux/slab.h>
16#include <linux/string.h>
17#include <linux/seq_file.h>
18#include <linux/vmalloc.h>
19#include "gcov.h"
20
21#define GCOV_COUNTERS 8
22#define GCOV_TAG_FUNCTION_LENGTH 3
23
24static struct gcov_info *gcov_info_head;
25
26/**
27 * struct gcov_ctr_info - information about counters for a single function
28 * @num: number of counter values for this type
29 * @values: array of counter values for this type
30 *
31 * This data is generated by gcc during compilation and doesn't change
32 * at run-time with the exception of the values array.
33 */
34struct gcov_ctr_info {
35 unsigned int num;
36 gcov_type *values;
37};
38
39/**
40 * struct gcov_fn_info - profiling meta data per function
41 * @key: comdat key
42 * @ident: unique ident of function
 43 * @lineno_checksum: function lineno checksum
44 * @cfg_checksum: function cfg checksum
45 * @ctrs: instrumented counters
46 *
47 * This data is generated by gcc during compilation and doesn't change
48 * at run-time.
49 *
50 * Information about a single function. This uses the trailing array
51 * idiom. The number of counters is determined from the merge pointer
52 * array in gcov_info. The key is used to detect which of a set of
53 * comdat functions was selected -- it points to the gcov_info object
54 * of the object file containing the selected comdat function.
55 */
56struct gcov_fn_info {
57 const struct gcov_info *key;
58 unsigned int ident;
59 unsigned int lineno_checksum;
60 unsigned int cfg_checksum;
61 struct gcov_ctr_info ctrs[0];
62};
63
64/**
65 * struct gcov_info - profiling data per object file
66 * @version: gcov version magic indicating the gcc version used for compilation
67 * @next: list head for a singly-linked list
68 * @stamp: uniquifying time stamp
69 * @filename: name of the associated gcov data file
70 * @merge: merge functions (null for unused counter type)
71 * @n_functions: number of instrumented functions
72 * @functions: pointer to pointers to function information
73 *
74 * This data is generated by gcc during compilation and doesn't change
75 * at run-time with the exception of the next pointer.
76 */
77struct gcov_info {
78 unsigned int version;
79 struct gcov_info *next;
80 unsigned int stamp;
81 const char *filename;
82 void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
83 unsigned int n_functions;
84 struct gcov_fn_info **functions;
85};
86
87/**
88 * gcov_info_filename - return info filename
89 * @info: profiling data set
90 */
91const char *gcov_info_filename(struct gcov_info *info)
92{
93 return info->filename;
94}
95
96/**
97 * gcov_info_version - return info version
98 * @info: profiling data set
99 */
100unsigned int gcov_info_version(struct gcov_info *info)
101{
102 return info->version;
103}
104
105/**
106 * gcov_info_next - return next profiling data set
107 * @info: profiling data set
108 *
109 * Returns next gcov_info following @info or first gcov_info in the chain if
110 * @info is %NULL.
111 */
112struct gcov_info *gcov_info_next(struct gcov_info *info)
113{
114 if (!info)
115 return gcov_info_head;
116
117 return info->next;
118}
119
120/**
121 * gcov_info_link - link/add profiling data set to the list
122 * @info: profiling data set
123 */
124void gcov_info_link(struct gcov_info *info)
125{
126 info->next = gcov_info_head;
127 gcov_info_head = info;
128}
129
130/**
131 * gcov_info_unlink - unlink/remove profiling data set from the list
132 * @prev: previous profiling data set
133 * @info: profiling data set
134 */
135void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
136{
137 if (prev)
138 prev->next = info->next;
139 else
140 gcov_info_head = info->next;
141}
142
143/* Symbolic links to be created for each profiling data file. */
144const struct gcov_link gcov_link[] = {
145 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
146 { 0, NULL},
147};
148
149/*
150 * Determine whether a counter is active. Doesn't change at run-time.
151 */
152static int counter_active(struct gcov_info *info, unsigned int type)
153{
154 return info->merge[type] ? 1 : 0;
155}
156
157/* Determine number of active counters. Based on gcc magic. */
158static unsigned int num_counter_active(struct gcov_info *info)
159{
160 unsigned int i;
161 unsigned int result = 0;
162
163 for (i = 0; i < GCOV_COUNTERS; i++) {
164 if (counter_active(info, i))
165 result++;
166 }
167 return result;
168}
169
170/**
171 * gcov_info_reset - reset profiling data to zero
172 * @info: profiling data set
173 */
174void gcov_info_reset(struct gcov_info *info)
175{
176 struct gcov_ctr_info *ci_ptr;
177 unsigned int fi_idx;
178 unsigned int ct_idx;
179
180 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
181 ci_ptr = info->functions[fi_idx]->ctrs;
182
183 for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
184 if (!counter_active(info, ct_idx))
185 continue;
186
187 memset(ci_ptr->values, 0,
188 sizeof(gcov_type) * ci_ptr->num);
189 ci_ptr++;
190 }
191 }
192}
193
194/**
195 * gcov_info_is_compatible - check if profiling data can be added
196 * @info1: first profiling data set
197 * @info2: second profiling data set
198 *
199 * Returns non-zero if profiling data can be added, zero otherwise.
200 */
201int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
202{
203 return (info1->stamp == info2->stamp);
204}
205
206/**
207 * gcov_info_add - add up profiling data
208 * @dest: profiling data set to which data is added
209 * @source: profiling data set which is added
210 *
211 * Adds profiling counts of @source to @dest.
212 */
213void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
214{
215 struct gcov_ctr_info *dci_ptr;
216 struct gcov_ctr_info *sci_ptr;
217 unsigned int fi_idx;
218 unsigned int ct_idx;
219 unsigned int val_idx;
220
221 for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) {
222 dci_ptr = dst->functions[fi_idx]->ctrs;
223 sci_ptr = src->functions[fi_idx]->ctrs;
224
225 for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
226 if (!counter_active(src, ct_idx))
227 continue;
228
229 for (val_idx = 0; val_idx < sci_ptr->num; val_idx++)
230 dci_ptr->values[val_idx] +=
231 sci_ptr->values[val_idx];
232
233 dci_ptr++;
234 sci_ptr++;
235 }
236 }
237}
238
239/**
240 * gcov_info_dup - duplicate profiling data set
241 * @info: profiling data set to duplicate
242 *
243 * Return newly allocated duplicate on success, %NULL on error.
244 */
245struct gcov_info *gcov_info_dup(struct gcov_info *info)
246{
247 struct gcov_info *dup;
248 struct gcov_ctr_info *dci_ptr; /* dst counter info */
249 struct gcov_ctr_info *sci_ptr; /* src counter info */
250 unsigned int active;
251 unsigned int fi_idx; /* function info idx */
252 unsigned int ct_idx; /* counter type idx */
253 size_t fi_size; /* function info size */
254 size_t cv_size; /* counter values size */
255
256 dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
257 if (!dup)
258 return NULL;
259
260 dup->next = NULL;
261 dup->filename = NULL;
262 dup->functions = NULL;
263
264 dup->filename = kstrdup(info->filename, GFP_KERNEL);
265 if (!dup->filename)
266 goto err_free;
267
268 dup->functions = kcalloc(info->n_functions,
269 sizeof(struct gcov_fn_info *), GFP_KERNEL);
270 if (!dup->functions)
271 goto err_free;
272
273 active = num_counter_active(info);
274 fi_size = sizeof(struct gcov_fn_info);
275 fi_size += sizeof(struct gcov_ctr_info) * active;
276
277 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
278 dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL);
279 if (!dup->functions[fi_idx])
280 goto err_free;
281
282 *(dup->functions[fi_idx]) = *(info->functions[fi_idx]);
283
284 sci_ptr = info->functions[fi_idx]->ctrs;
285 dci_ptr = dup->functions[fi_idx]->ctrs;
286
287 for (ct_idx = 0; ct_idx < active; ct_idx++) {
288
289 cv_size = sizeof(gcov_type) * sci_ptr->num;
290
291 dci_ptr->values = vmalloc(cv_size);
292
293 if (!dci_ptr->values)
294 goto err_free;
295
296 dci_ptr->num = sci_ptr->num;
297 memcpy(dci_ptr->values, sci_ptr->values, cv_size);
298
299 sci_ptr++;
300 dci_ptr++;
301 }
302 }
303
304 return dup;
305err_free:
306 gcov_info_free(dup);
307 return NULL;
308}
309
310/**
311 * gcov_info_free - release memory for profiling data set duplicate
312 * @info: profiling data set duplicate to free
313 */
314void gcov_info_free(struct gcov_info *info)
315{
316 unsigned int active;
317 unsigned int fi_idx;
318 unsigned int ct_idx;
319 struct gcov_ctr_info *ci_ptr;
320
321 if (!info->functions)
322 goto free_info;
323
324 active = num_counter_active(info);
325
326 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
327 if (!info->functions[fi_idx])
328 continue;
329
330 ci_ptr = info->functions[fi_idx]->ctrs;
331
332 for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
333 vfree(ci_ptr->values);
334
335 kfree(info->functions[fi_idx]);
336 }
337
338free_info:
339 kfree(info->functions);
340 kfree(info->filename);
341 kfree(info);
342}
343
344#define ITER_STRIDE PAGE_SIZE
345
346/**
347 * struct gcov_iterator - specifies current file position in logical records
348 * @info: associated profiling data
349 * @buffer: buffer containing file data
350 * @size: size of buffer
351 * @pos: current position in file
352 */
353struct gcov_iterator {
354 struct gcov_info *info;
355 void *buffer;
356 size_t size;
357 loff_t pos;
358};
359
360/**
361 * store_gcov_u32 - store 32 bit number in gcov format to buffer
362 * @buffer: target buffer or NULL
363 * @off: offset into the buffer
364 * @v: value to be stored
365 *
366 * Number format defined by gcc: numbers are recorded in the 32 bit
367 * unsigned binary form of the endianness of the machine generating the
368 * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
369 * store anything.
370 */
371static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
372{
373 u32 *data;
374
375 if (buffer) {
376 data = buffer + off;
377 *data = v;
378 }
379
380 return sizeof(*data);
381}
382
383/**
384 * store_gcov_u64 - store 64 bit number in gcov format to buffer
385 * @buffer: target buffer or NULL
386 * @off: offset into the buffer
387 * @v: value to be stored
388 *
389 * Number format defined by gcc: numbers are recorded in the 32 bit
390 * unsigned binary form of the endianness of the machine generating the
391 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
392 * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
393 * anything.
394 */
395static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
396{
397 u32 *data;
398
399 if (buffer) {
400 data = buffer + off;
401
402 data[0] = (v & 0xffffffffUL);
403 data[1] = (v >> 32);
404 }
405
406 return sizeof(*data) * 2;
407}
408
409/**
410 * convert_to_gcda - convert profiling data set to gcda file format
411 * @buffer: the buffer to store file data or %NULL if no data should be stored
412 * @info: profiling data set to be converted
413 *
414 * Returns the number of bytes that were/would have been stored into the buffer.
415 */
416static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
417{
418 struct gcov_fn_info *fi_ptr;
419 struct gcov_ctr_info *ci_ptr;
420 unsigned int fi_idx;
421 unsigned int ct_idx;
422 unsigned int cv_idx;
423 size_t pos = 0;
424
425 /* File header. */
426 pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
427 pos += store_gcov_u32(buffer, pos, info->version);
428 pos += store_gcov_u32(buffer, pos, info->stamp);
429
430 for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
431 fi_ptr = info->functions[fi_idx];
432
433 /* Function record. */
434 pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
435 pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
436 pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
437 pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
438 pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
439
440 ci_ptr = fi_ptr->ctrs;
441
442 for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
443 if (!counter_active(info, ct_idx))
444 continue;
445
446 /* Counter record. */
447 pos += store_gcov_u32(buffer, pos,
448 GCOV_TAG_FOR_COUNTER(ct_idx));
449 pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
450
451 for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
452 pos += store_gcov_u64(buffer, pos,
453 ci_ptr->values[cv_idx]);
454 }
455
456 ci_ptr++;
457 }
458 }
459
460 return pos;
461}
462
463/**
464 * gcov_iter_new - allocate and initialize profiling data iterator
465 * @info: profiling data set to be iterated
466 *
467 * Return file iterator on success, %NULL otherwise.
468 */
469struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
470{
471 struct gcov_iterator *iter;
472
473 iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
474 if (!iter)
475 goto err_free;
476
477 iter->info = info;
478 /* Dry-run to get the actual buffer size. */
479 iter->size = convert_to_gcda(NULL, info);
480 iter->buffer = vmalloc(iter->size);
481 if (!iter->buffer)
482 goto err_free;
483
484 convert_to_gcda(iter->buffer, info);
485
486 return iter;
487
488err_free:
489 kfree(iter);
490 return NULL;
491}
492
493
494/**
495 * gcov_iter_free - release memory for file iterator
496 * @iter: file iterator to free
497 */
498void gcov_iter_free(struct gcov_iterator *iter)
499{
500 vfree(iter->buffer);
501 kfree(iter);
502}
503
504/**
505 * gcov_iter_get_info - return profiling data set for given file iterator
506 * @iter: file iterator
507 */
508struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
509{
510 return iter->info;
511}
512
513/**
514 * gcov_iter_start - reset file iterator to starting position
515 * @iter: file iterator
516 */
517void gcov_iter_start(struct gcov_iterator *iter)
518{
519 iter->pos = 0;
520}
521
522/**
523 * gcov_iter_next - advance file iterator to next logical record
524 * @iter: file iterator
525 *
526 * Return zero if new position is valid, non-zero if iterator has reached end.
527 */
528int gcov_iter_next(struct gcov_iterator *iter)
529{
530 if (iter->pos < iter->size)
531 iter->pos += ITER_STRIDE;
532
533 if (iter->pos >= iter->size)
534 return -EINVAL;
535
536 return 0;
537}
538
539/**
540 * gcov_iter_write - write data for current pos to seq_file
541 * @iter: file iterator
542 * @seq: seq_file handle
543 *
544 * Return zero on success, non-zero otherwise.
545 */
546int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
547{
548 size_t len;
549
550 if (iter->pos >= iter->size)
551 return -EINVAL;
552
553 len = ITER_STRIDE;
554 if (iter->pos + len > iter->size)
555 len = iter->size - iter->pos;
556
557 seq_write(seq, iter->buffer + iter->pos, len);
558
559 return 0;
560}
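gcov_iter_new() above sizes the .gcda image by first calling convert_to_gcda() with a NULL buffer, then allocates and converts for real; store_gcov_u64() splits 64-bit counters into two host-endian 32-bit words, low part first. A compact userspace sketch of both ideas, using a simplified record layout rather than the real gcda format:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static size_t store_u32(void *buf, size_t off, uint32_t v)
{
        if (buf)
                memcpy((char *)buf + off, &v, sizeof(v));
        return sizeof(v);
}

static size_t store_u64(void *buf, size_t off, uint64_t v)
{
        /* low 32 bits first, then high 32 bits, as in store_gcov_u64() */
        off += store_u32(buf, off, (uint32_t)(v & 0xffffffffu));
        store_u32(buf, off, (uint32_t)(v >> 32));
        return 2 * sizeof(uint32_t);
}

/* Serialize a toy record; with buf == NULL this only measures the size. */
static size_t convert(void *buf, const uint64_t *counters, unsigned int n)
{
        size_t pos = 0;
        unsigned int i;

        pos += store_u32(buf, pos, 0x67636461u);        /* magic, like GCOV_DATA_MAGIC */
        pos += store_u32(buf, pos, n);
        for (i = 0; i < n; i++)
                pos += store_u64(buf, pos, counters[i]);
        return pos;
}

int main(void)
{
        uint64_t counters[] = { 1, 2, 1ull << 40 };
        size_t size = convert(NULL, counters, 3);       /* dry run to get the size */
        void *buf = malloc(size);

        if (!buf)
                return 1;
        convert(buf, counters, 3);                      /* second pass fills the buffer */
        printf("%zu bytes\n", size);
        free(buf);
        return 0;
}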
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 060073ebf7a6..92c8e22a29ed 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -21,7 +21,6 @@
21 * gcc and need to be kept as close to the original definition as possible to 21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible. 22 * remain compatible.
23 */ 23 */
24#define GCOV_COUNTERS 5
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) 24#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) 25#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) 26#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
@@ -34,60 +33,18 @@ typedef long gcov_type;
34typedef long long gcov_type; 33typedef long long gcov_type;
35#endif 34#endif
36 35
37/** 36/* Opaque gcov_info. The gcov structures can change as for example in gcc 4.7 so
38 * struct gcov_fn_info - profiling meta data per function 37 * we cannot use full definition here and they need to be placed in gcc specific
39 * @ident: object file-unique function identifier 38 * implementation of gcov. This also means no direct access to the members in
40 * @checksum: function checksum 39 * generic code and usage of the interface below.*/
41 * @n_ctrs: number of values per counter type belonging to this function 40struct gcov_info;
42 *
43 * This data is generated by gcc during compilation and doesn't change
44 * at run-time.
45 */
46struct gcov_fn_info {
47 unsigned int ident;
48 unsigned int checksum;
49 unsigned int n_ctrs[0];
50};
51
52/**
53 * struct gcov_ctr_info - profiling data per counter type
54 * @num: number of counter values for this type
55 * @values: array of counter values for this type
56 * @merge: merge function for counter values of this type (unused)
57 *
58 * This data is generated by gcc during compilation and doesn't change
59 * at run-time with the exception of the values array.
60 */
61struct gcov_ctr_info {
62 unsigned int num;
63 gcov_type *values;
64 void (*merge)(gcov_type *, unsigned int);
65};
66 41
67/** 42/* Interface to access gcov_info data */
68 * struct gcov_info - profiling data per object file 43const char *gcov_info_filename(struct gcov_info *info);
69 * @version: gcov version magic indicating the gcc version used for compilation 44unsigned int gcov_info_version(struct gcov_info *info);
70 * @next: list head for a singly-linked list 45struct gcov_info *gcov_info_next(struct gcov_info *info);
71 * @stamp: time stamp 46void gcov_info_link(struct gcov_info *info);
72 * @filename: name of the associated gcov data file 47void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
73 * @n_functions: number of instrumented functions
74 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active
76 * @counts: counter data per counter type
77 *
78 * This data is generated by gcc during compilation and doesn't change
79 * at run-time with the exception of the next pointer.
80 */
81struct gcov_info {
82 unsigned int version;
83 struct gcov_info *next;
84 unsigned int stamp;
85 const char *filename;
86 unsigned int n_functions;
87 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask;
89 struct gcov_ctr_info counts[0];
90};
91 48
92/* Base interface. */ 49/* Base interface. */
93enum gcov_action { 50enum gcov_action {
diff --git a/kernel/groups.c b/kernel/groups.c
index 6b2588dd04ff..90cf1c38c8ea 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
233 struct group_info *group_info; 233 struct group_info *group_info;
234 int retval; 234 int retval;
235 235
236 if (!nsown_capable(CAP_SETGID)) 236 if (!ns_capable(current_user_ns(), CAP_SETGID))
237 return -EPERM; 237 return -EPERM;
238 if ((unsigned)gidsetsize > NGROUPS_MAX) 238 if ((unsigned)gidsetsize > NGROUPS_MAX)
239 return -EINVAL; 239 return -EINVAL;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 3e97fb126e6b..9328b80eaf14 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -16,11 +16,12 @@
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18#include <linux/utsname.h> 18#include <linux/utsname.h>
19#include <trace/events/sched.h>
19 20
20/* 21/*
21 * The number of tasks checked: 22 * The number of tasks checked:
22 */ 23 */
23unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; 24int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
24 25
25/* 26/*
26 * Limit number of tasks checked in a batch. 27 * Limit number of tasks checked in a batch.
@@ -92,6 +93,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
92 t->last_switch_count = switch_count; 93 t->last_switch_count = switch_count;
93 return; 94 return;
94 } 95 }
96
97 trace_sched_process_hang(t);
98
95 if (!sysctl_hung_task_warnings) 99 if (!sysctl_hung_task_warnings)
96 return; 100 return;
97 sysctl_hung_task_warnings--; 101 sysctl_hung_task_warnings--;
@@ -203,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
203 return ret; 207 return ret;
204} 208}
205 209
210static atomic_t reset_hung_task = ATOMIC_INIT(0);
211
212void reset_hung_task_detector(void)
213{
214 atomic_set(&reset_hung_task, 1);
215}
216EXPORT_SYMBOL_GPL(reset_hung_task_detector);
217
206/* 218/*
207 * kthread which checks for tasks stuck in D state 219 * kthread which checks for tasks stuck in D state
208 */ 220 */
@@ -216,6 +228,9 @@ static int watchdog(void *dummy)
216 while (schedule_timeout_interruptible(timeout_jiffies(timeout))) 228 while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
217 timeout = sysctl_hung_task_timeout_secs; 229 timeout = sysctl_hung_task_timeout_secs;
218 230
231 if (atomic_xchg(&reset_hung_task, 0))
232 continue;
233
219 check_hung_uninterruptible_tasks(timeout); 234 check_hung_uninterruptible_tasks(timeout);
220 } 235 }
221 236
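reset_hung_task_detector() above just raises an atomic flag, and the watchdog loop consumes it with atomic_xchg() so a single request suppresses exactly one scan. A minimal userspace sketch of that request/consume handshake using C11 atomics (plain function calls instead of the kthread, hypothetical names):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int reset_requested = 0;

static void request_reset(void)                 /* like reset_hung_task_detector() */
{
        atomic_store(&reset_requested, 1);
}

static void watchdog_iteration(int round)
{
        /* Atomic exchange reads the flag and clears it in one step,
         * so one request skips exactly one scan. */
        if (atomic_exchange(&reset_requested, 0)) {
                printf("round %d: reset requested, skipping scan\n", round);
                return;
        }
        printf("round %d: scanning for hung tasks\n", round);
}

int main(void)
{
        watchdog_iteration(1);
        request_reset();
        watchdog_iteration(2);          /* skipped */
        watchdog_iteration(3);          /* scans again */
        return 0;
}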
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d1a758bc972a..4a1fef09f658 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,15 +1,4 @@
1# Select this to activate the generic irq options below
2config HAVE_GENERIC_HARDIRQS
3 bool
4
5if HAVE_GENERIC_HARDIRQS
6menu "IRQ subsystem" 1menu "IRQ subsystem"
7#
8# Interrupt subsystem related configuration options
9#
10config GENERIC_HARDIRQS
11 def_bool y
12
13# Options selectable by the architecture code 2# Options selectable by the architecture code
14 3
15# Make sparse irq Kconfig switch below available 4# Make sparse irq Kconfig switch below available
@@ -84,4 +73,3 @@ config SPARSE_IRQ
84 If you don't know what to do here, say N. 73 If you don't know what to do here, say N.
85 74
86endmenu 75endmenu
87endif
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a3bb14fbe5c6..dc04c166c54d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -214,7 +214,7 @@ void irq_enable(struct irq_desc *desc)
214} 214}
215 215
216/** 216/**
217 * irq_disable - Mark interupt disabled 217 * irq_disable - Mark interrupt disabled
218 * @desc: irq descriptor which should be disabled 218 * @desc: irq descriptor which should be disabled
219 * 219 *
220 * If the chip does not implement the irq_disable callback, we 220 * If the chip does not implement the irq_disable callback, we
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 706724e9835d..cf68bb36fe58 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
465} 465}
466EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 466EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
467 467
468unsigned int irq_create_of_mapping(struct device_node *controller, 468unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
469 const u32 *intspec, unsigned int intsize)
470{ 469{
471 struct irq_domain *domain; 470 struct irq_domain *domain;
472 irq_hw_number_t hwirq; 471 irq_hw_number_t hwirq;
473 unsigned int type = IRQ_TYPE_NONE; 472 unsigned int type = IRQ_TYPE_NONE;
474 unsigned int virq; 473 unsigned int virq;
475 474
476 domain = controller ? irq_find_host(controller) : irq_default_domain; 475 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
477 if (!domain) { 476 if (!domain) {
478 pr_warn("no irq domain found for %s !\n", 477 pr_warn("no irq domain found for %s !\n",
479 of_node_full_name(controller)); 478 of_node_full_name(irq_data->np));
480 return 0; 479 return 0;
481 } 480 }
482 481
483 /* If domain has no translation, then we assume interrupt line */ 482 /* If domain has no translation, then we assume interrupt line */
484 if (domain->ops->xlate == NULL) 483 if (domain->ops->xlate == NULL)
485 hwirq = intspec[0]; 484 hwirq = irq_data->args[0];
486 else { 485 else {
487 if (domain->ops->xlate(domain, controller, intspec, intsize, 486 if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
488 &hwirq, &type)) 487 irq_data->args_count, &hwirq, &type))
489 return 0; 488 return 0;
490 } 489 }
491 490
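irq_create_of_mapping() now takes a single of_phandle_args instead of a controller node plus a raw intspec pointer and size. A toy userspace sketch of that "bundle the specifier into one struct" refactor (struct and function names are illustrative, not the kernel types):

#include <stdio.h>

#define MAX_ARGS 4

/* Shaped loosely like struct of_phandle_args: node handle plus specifier cells. */
struct irq_args {
        const char *node_name;          /* stands in for the device_node pointer */
        int args_count;
        unsigned int args[MAX_ARGS];
};

/* Old style: three loosely coupled parameters. */
static unsigned int map_old(const char *node, const unsigned int *spec, int n)
{
        (void)node;
        return n > 0 ? spec[0] : 0;
}

/* New style: one argument carries everything, so callers cannot
 * mismatch the specifier pointer and its length. */
static unsigned int map_new(const struct irq_args *d)
{
        return d->args_count > 0 ? d->args[0] : 0;
}

int main(void)
{
        unsigned int spec[] = { 17, 4 };
        struct irq_args d = { "gpio-controller", 2, { 17, 4 } };

        printf("%u %u\n", map_old("gpio-controller", spec, 2), map_new(&d));
        return 0;
}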
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 514bcfd855a8..481a13c43b17 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -786,7 +786,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
786} 786}
787 787
788/* 788/*
789 * Interrupts explicitely requested as threaded interupts want to be 789 * Interrupts explicitly requested as threaded interrupts want to be
790 * preemptible - many of them need to sleep and wait for slow busses to 790 * preemptible - many of them need to sleep and wait for slow busses to
791 * complete. 791 * complete.
792 */ 792 */
@@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
956 goto out_mput; 956 goto out_mput;
957 } 957 }
958 958
959 sched_setscheduler(t, SCHED_FIFO, &param); 959 sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
960 960
961 /* 961 /*
962 * We keep the reference to the task struct even if 962 * We keep the reference to the task struct even if
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 1162f1030f18..3320b84cc60f 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -14,6 +14,7 @@ enum {
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, 16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17 _IRQ_IS_POLLED = IRQ_IS_POLLED,
17 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 18 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
18}; 19};
19 20
@@ -26,6 +27,7 @@ enum {
26#define IRQ_NOAUTOEN GOT_YOU_MORON 27#define IRQ_NOAUTOEN GOT_YOU_MORON
27#define IRQ_NESTED_THREAD GOT_YOU_MORON 28#define IRQ_NESTED_THREAD GOT_YOU_MORON
28#define IRQ_PER_CPU_DEVID GOT_YOU_MORON 29#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
30#define IRQ_IS_POLLED GOT_YOU_MORON
29#undef IRQF_MODIFY_MASK 31#undef IRQF_MODIFY_MASK
30#define IRQF_MODIFY_MASK GOT_YOU_MORON 32#define IRQF_MODIFY_MASK GOT_YOU_MORON
31 33
@@ -147,3 +149,8 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
147{ 149{
148 return desc->status_use_accessors & _IRQ_NESTED_THREAD; 150 return desc->status_use_accessors & _IRQ_NESTED_THREAD;
149} 151}
152
153static inline bool irq_settings_is_polled(struct irq_desc *desc)
154{
155 return desc->status_use_accessors & _IRQ_IS_POLLED;
156}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 7b5f012bde9d..a1d8cc63b56e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -67,8 +67,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
67 67
68 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
69 69
70 /* PER_CPU and nested thread interrupts are never polled */ 70 /*
 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) 71 * PER_CPU, nested thread interrupts and interrupts explicitly
72 * marked polled are excluded from polling.
73 */
74 if (irq_settings_is_per_cpu(desc) ||
75 irq_settings_is_nested_thread(desc) ||
76 irq_settings_is_polled(desc))
72 goto out; 77 goto out;
73 78
74 /* 79 /*
@@ -268,7 +273,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
268void note_interrupt(unsigned int irq, struct irq_desc *desc, 273void note_interrupt(unsigned int irq, struct irq_desc *desc,
269 irqreturn_t action_ret) 274 irqreturn_t action_ret)
270{ 275{
271 if (desc->istate & IRQS_POLL_INPROGRESS) 276 if (desc->istate & IRQS_POLL_INPROGRESS ||
277 irq_settings_is_polled(desc))
272 return; 278 return;
273 279
274 /* we get here again via the threaded handler */ 280 /* we get here again via the threaded handler */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 297a9247a3b3..9019f15deab2 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,6 +58,7 @@ static void jump_label_update(struct static_key *key, int enable);
58 58
59void static_key_slow_inc(struct static_key *key) 59void static_key_slow_inc(struct static_key *key)
60{ 60{
61 STATIC_KEY_CHECK_USE();
61 if (atomic_inc_not_zero(&key->enabled)) 62 if (atomic_inc_not_zero(&key->enabled))
62 return; 63 return;
63 64
@@ -103,12 +104,14 @@ static void jump_label_update_timeout(struct work_struct *work)
103 104
104void static_key_slow_dec(struct static_key *key) 105void static_key_slow_dec(struct static_key *key)
105{ 106{
107 STATIC_KEY_CHECK_USE();
106 __static_key_slow_dec(key, 0, NULL); 108 __static_key_slow_dec(key, 0, NULL);
107} 109}
108EXPORT_SYMBOL_GPL(static_key_slow_dec); 110EXPORT_SYMBOL_GPL(static_key_slow_dec);
109 111
110void static_key_slow_dec_deferred(struct static_key_deferred *key) 112void static_key_slow_dec_deferred(struct static_key_deferred *key)
111{ 113{
114 STATIC_KEY_CHECK_USE();
112 __static_key_slow_dec(&key->key, key->timeout, &key->work); 115 __static_key_slow_dec(&key->key, key->timeout, &key->work);
113} 116}
114EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); 117EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
@@ -116,6 +119,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
116void jump_label_rate_limit(struct static_key_deferred *key, 119void jump_label_rate_limit(struct static_key_deferred *key,
117 unsigned long rl) 120 unsigned long rl)
118{ 121{
122 STATIC_KEY_CHECK_USE();
119 key->timeout = rl; 123 key->timeout = rl;
120 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); 124 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
121} 125}
@@ -212,6 +216,7 @@ void __init jump_label_init(void)
212 key->next = NULL; 216 key->next = NULL;
213#endif 217#endif
214 } 218 }
219 static_key_initialized = true;
215 jump_label_unlock(); 220 jump_label_unlock();
216} 221}
217 222
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f7b55ba745..490afc03627e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -921,7 +921,7 @@ static int kimage_load_segment(struct kimage *image,
921 * reinitialize them. 921 * reinitialize them.
922 * 922 *
923 * - A machine specific part that includes the syscall number 923 * - A machine specific part that includes the syscall number
924 * and the copies the image to it's final destination. And 924 * and then copies the image to its final destination. And
925 * jumps into the image at entry. 925 * jumps into the image at entry.
926 * 926 *
927 * kexec does not sync, or unmount filesystems so if you need 927 * kexec does not sync, or unmount filesystems so if you need
@@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline,
1474 if (first_colon && (!first_space || first_colon < first_space)) 1474 if (first_colon && (!first_space || first_colon < first_space))
1475 return parse_crashkernel_mem(ck_cmdline, system_ram, 1475 return parse_crashkernel_mem(ck_cmdline, system_ram,
1476 crash_size, crash_base); 1476 crash_size, crash_base);
1477 else
1478 return parse_crashkernel_simple(ck_cmdline, crash_size,
1479 crash_base);
1480 1477
1481 return 0; 1478 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1482} 1479}
1483 1480
1484/* 1481/*
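The crashkernel change above only straightens the tail of __parse_crashkernel(); which parser runs is still decided by whether a ':' appears before any space. For illustration (syntax per the kernel's kdump documentation; the values are arbitrary examples):

        crashkernel=512M-2G:64M,2G-:128M    handled by parse_crashkernel_mem()
        crashkernel=128M@16M                handled by parse_crashkernel_simple()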
diff --git a/kernel/kmod.c b/kernel/kmod.c
index fb326365b694..b086006c59e7 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -571,6 +571,10 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
571 DECLARE_COMPLETION_ONSTACK(done); 571 DECLARE_COMPLETION_ONSTACK(done);
572 int retval = 0; 572 int retval = 0;
573 573
574 if (!sub_info->path) {
575 call_usermodehelper_freeinfo(sub_info);
576 return -EINVAL;
577 }
574 helper_lock(); 578 helper_lock();
575 if (!khelper_wq || usermodehelper_disabled) { 579 if (!khelper_wq || usermodehelper_disabled) {
576 retval = -EBUSY; 580 retval = -EBUSY;
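With the check added above, call_usermodehelper_exec() now fails early with -EINVAL (and frees the subprocess_info) when no helper path was supplied, instead of the khelper workqueue tripping over it later. A minimal caller sketch, with a hypothetical helper binary path:

        #include <linux/kmod.h>

        static int example_run_helper(void)
        {
                char *argv[] = { "/sbin/example-helper", NULL };        /* hypothetical binary */
                char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

                /* a NULL path would now be rejected with -EINVAL up front */
                return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
        }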
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6e33498d665c..ceeadfcabb76 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
112struct kprobe_insn_page { 112struct kprobe_insn_page {
113 struct list_head list; 113 struct list_head list;
114 kprobe_opcode_t *insns; /* Page of instruction slots */ 114 kprobe_opcode_t *insns; /* Page of instruction slots */
115 struct kprobe_insn_cache *cache;
115 int nused; 116 int nused;
116 int ngarbage; 117 int ngarbage;
117 char slot_used[]; 118 char slot_used[];
@@ -121,12 +122,6 @@ struct kprobe_insn_page {
121 (offsetof(struct kprobe_insn_page, slot_used) + \ 122 (offsetof(struct kprobe_insn_page, slot_used) + \
122 (sizeof(char) * (slots))) 123 (sizeof(char) * (slots)))
123 124
124struct kprobe_insn_cache {
125 struct list_head pages; /* list of kprobe_insn_page */
126 size_t insn_size; /* size of instruction slot */
127 int nr_garbage;
128};
129
130static int slots_per_page(struct kprobe_insn_cache *c) 125static int slots_per_page(struct kprobe_insn_cache *c)
131{ 126{
132 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); 127 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
@@ -138,8 +133,20 @@ enum kprobe_slot_state {
138 SLOT_USED = 2, 133 SLOT_USED = 2,
139}; 134};
140 135
141static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ 136static void *alloc_insn_page(void)
142static struct kprobe_insn_cache kprobe_insn_slots = { 137{
138 return module_alloc(PAGE_SIZE);
139}
140
141static void free_insn_page(void *page)
142{
143 module_free(NULL, page);
144}
145
146struct kprobe_insn_cache kprobe_insn_slots = {
147 .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
148 .alloc = alloc_insn_page,
149 .free = free_insn_page,
143 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), 150 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
144 .insn_size = MAX_INSN_SIZE, 151 .insn_size = MAX_INSN_SIZE,
145 .nr_garbage = 0, 152 .nr_garbage = 0,
@@ -150,10 +157,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
150 * __get_insn_slot() - Find a slot on an executable page for an instruction. 157 * __get_insn_slot() - Find a slot on an executable page for an instruction.
151 * We allocate an executable page if there's no room on existing ones. 158 * We allocate an executable page if there's no room on existing ones.
152 */ 159 */
153static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) 160kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
154{ 161{
155 struct kprobe_insn_page *kip; 162 struct kprobe_insn_page *kip;
163 kprobe_opcode_t *slot = NULL;
156 164
165 mutex_lock(&c->mutex);
157 retry: 166 retry:
158 list_for_each_entry(kip, &c->pages, list) { 167 list_for_each_entry(kip, &c->pages, list) {
159 if (kip->nused < slots_per_page(c)) { 168 if (kip->nused < slots_per_page(c)) {
@@ -162,7 +171,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
162 if (kip->slot_used[i] == SLOT_CLEAN) { 171 if (kip->slot_used[i] == SLOT_CLEAN) {
163 kip->slot_used[i] = SLOT_USED; 172 kip->slot_used[i] = SLOT_USED;
164 kip->nused++; 173 kip->nused++;
165 return kip->insns + (i * c->insn_size); 174 slot = kip->insns + (i * c->insn_size);
175 goto out;
166 } 176 }
167 } 177 }
168 /* kip->nused is broken. Fix it. */ 178 /* kip->nused is broken. Fix it. */
@@ -178,37 +188,29 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
178 /* All out of space. Need to allocate a new page. */ 188 /* All out of space. Need to allocate a new page. */
179 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); 189 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
180 if (!kip) 190 if (!kip)
181 return NULL; 191 goto out;
182 192
183 /* 193 /*
184 * Use module_alloc so this page is within +/- 2GB of where the 194 * Use module_alloc so this page is within +/- 2GB of where the
185 * kernel image and loaded module images reside. This is required 195 * kernel image and loaded module images reside. This is required
186 * so x86_64 can correctly handle the %rip-relative fixups. 196 * so x86_64 can correctly handle the %rip-relative fixups.
187 */ 197 */
188 kip->insns = module_alloc(PAGE_SIZE); 198 kip->insns = c->alloc();
189 if (!kip->insns) { 199 if (!kip->insns) {
190 kfree(kip); 200 kfree(kip);
191 return NULL; 201 goto out;
192 } 202 }
193 INIT_LIST_HEAD(&kip->list); 203 INIT_LIST_HEAD(&kip->list);
194 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); 204 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
195 kip->slot_used[0] = SLOT_USED; 205 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 206 kip->nused = 1;
197 kip->ngarbage = 0; 207 kip->ngarbage = 0;
208 kip->cache = c;
198 list_add(&kip->list, &c->pages); 209 list_add(&kip->list, &c->pages);
199 return kip->insns; 210 slot = kip->insns;
200} 211out:
201 212 mutex_unlock(&c->mutex);
202 213 return slot;
203kprobe_opcode_t __kprobes *get_insn_slot(void)
204{
205 kprobe_opcode_t *ret = NULL;
206
207 mutex_lock(&kprobe_insn_mutex);
208 ret = __get_insn_slot(&kprobe_insn_slots);
209 mutex_unlock(&kprobe_insn_mutex);
210
211 return ret;
212} 214}
213 215
214/* Return 1 if all garbages are collected, otherwise 0. */ 216/* Return 1 if all garbages are collected, otherwise 0. */
@@ -225,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
225 */ 227 */
226 if (!list_is_singular(&kip->list)) { 228 if (!list_is_singular(&kip->list)) {
227 list_del(&kip->list); 229 list_del(&kip->list);
228 module_free(NULL, kip->insns); 230 kip->cache->free(kip->insns);
229 kfree(kip); 231 kfree(kip);
230 } 232 }
231 return 1; 233 return 1;
@@ -255,11 +257,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
255 return 0; 257 return 0;
256} 258}
257 259
258static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, 260void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
259 kprobe_opcode_t *slot, int dirty) 261 kprobe_opcode_t *slot, int dirty)
260{ 262{
261 struct kprobe_insn_page *kip; 263 struct kprobe_insn_page *kip;
262 264
265 mutex_lock(&c->mutex);
263 list_for_each_entry(kip, &c->pages, list) { 266 list_for_each_entry(kip, &c->pages, list) {
264 long idx = ((long)slot - (long)kip->insns) / 267 long idx = ((long)slot - (long)kip->insns) /
265 (c->insn_size * sizeof(kprobe_opcode_t)); 268 (c->insn_size * sizeof(kprobe_opcode_t));
@@ -272,45 +275,25 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
272 collect_garbage_slots(c); 275 collect_garbage_slots(c);
273 } else 276 } else
274 collect_one_slot(kip, idx); 277 collect_one_slot(kip, idx);
275 return; 278 goto out;
276 } 279 }
277 } 280 }
278 /* Could not free this slot. */ 281 /* Could not free this slot. */
279 WARN_ON(1); 282 WARN_ON(1);
283out:
284 mutex_unlock(&c->mutex);
280} 285}
281 286
282void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
283{
284 mutex_lock(&kprobe_insn_mutex);
285 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
286 mutex_unlock(&kprobe_insn_mutex);
287}
288#ifdef CONFIG_OPTPROBES 287#ifdef CONFIG_OPTPROBES
289/* For optimized_kprobe buffer */ 288/* For optimized_kprobe buffer */
290static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ 289struct kprobe_insn_cache kprobe_optinsn_slots = {
291static struct kprobe_insn_cache kprobe_optinsn_slots = { 290 .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
291 .alloc = alloc_insn_page,
292 .free = free_insn_page,
292 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), 293 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
293 /* .insn_size is initialized later */ 294 /* .insn_size is initialized later */
294 .nr_garbage = 0, 295 .nr_garbage = 0,
295}; 296};
296/* Get a slot for optimized_kprobe buffer */
297kprobe_opcode_t __kprobes *get_optinsn_slot(void)
298{
299 kprobe_opcode_t *ret = NULL;
300
301 mutex_lock(&kprobe_optinsn_mutex);
302 ret = __get_insn_slot(&kprobe_optinsn_slots);
303 mutex_unlock(&kprobe_optinsn_mutex);
304
305 return ret;
306}
307
308void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
309{
310 mutex_lock(&kprobe_optinsn_mutex);
311 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
312 mutex_unlock(&kprobe_optinsn_mutex);
313}
314#endif 297#endif
315#endif 298#endif
316 299
@@ -2083,7 +2066,7 @@ static int __init init_kprobes(void)
2083{ 2066{
2084 int i, err = 0; 2067 int i, err = 0;
2085 unsigned long offset = 0, size = 0; 2068 unsigned long offset = 0, size = 0;
2086 char *modname, namebuf[128]; 2069 char *modname, namebuf[KSYM_NAME_LEN];
2087 const char *symbol_name; 2070 const char *symbol_name;
2088 void *addr; 2071 void *addr;
2089 struct kprobe_blackpoint *kb; 2072 struct kprobe_blackpoint *kb;
@@ -2209,7 +2192,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2209 const char *sym = NULL; 2192 const char *sym = NULL;
2210 unsigned int i = *(loff_t *) v; 2193 unsigned int i = *(loff_t *) v;
2211 unsigned long offset = 0; 2194 unsigned long offset = 0;
2212 char *modname, namebuf[128]; 2195 char *modname, namebuf[KSYM_NAME_LEN];
2213 2196
2214 head = &kprobe_table[i]; 2197 head = &kprobe_table[i];
2215 preempt_disable(); 2198 preempt_disable();
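The kprobes refactor above moves the per-cache mutex and the page alloc/free hooks into struct kprobe_insn_cache (whose definition now lives outside this file) and exports __get_insn_slot()/__free_insn_slot(), so the old get_insn_slot()/get_optinsn_slot() wrappers can become thin inlines and a cache with a different allocator can be declared the same way. A sketch with hypothetical names (my_cache, my_alloc_page, my_free_page), mirroring the initializers above:

        #include <linux/kprobes.h>
        #include <linux/moduleloader.h>

        static void *my_alloc_page(void)       { return module_alloc(PAGE_SIZE); }
        static void  my_free_page(void *page)  { module_free(NULL, page); }

        static struct kprobe_insn_cache my_cache = {
                .mutex = __MUTEX_INITIALIZER(my_cache.mutex),
                .alloc = my_alloc_page,
                .free  = my_free_page,
                .pages = LIST_HEAD_INIT(my_cache.pages),
                .insn_size = MAX_INSN_SIZE,
                .nr_garbage = 0,
        };

        static kprobe_opcode_t *my_get_slot(void)
        {
                return __get_insn_slot(&my_cache);              /* locking now lives inside */
        }

        static void my_put_slot(kprobe_opcode_t *slot, int dirty)
        {
                __free_insn_slot(&my_cache, slot, dirty);
        }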
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6ada93c23a9a..9659d38e008f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
113 unsigned long cnt; 113 unsigned long cnt;
114 int ret; 114 int ret;
115 115
116 if (strict_strtoul(buf, 0, &cnt)) 116 if (kstrtoul(buf, 0, &cnt))
117 return -EINVAL; 117 return -EINVAL;
118 118
119 ret = crash_shrink_memory(cnt); 119 ret = crash_shrink_memory(cnt);
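strict_strtoul() was deprecated in favour of kstrtoul(), which takes the same (string, base, result) arguments but parses more strictly. A simplified, hypothetical store-handler sketch:

        #include <linux/kernel.h>

        static ssize_t example_store(const char *buf, size_t count)    /* simplified signature */
        {
                unsigned long val;

                if (kstrtoul(buf, 0, &val))     /* base 0: accepts decimal, 0x... and 0... */
                        return -EINVAL;
                pr_info("example: %lu\n", val);
                return count;
        }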
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 760e86df8c20..b5ae3ee860a9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -33,7 +33,7 @@ struct kthread_create_info
33 33
34 /* Result passed back to kthread_create() from kthreadd. */ 34 /* Result passed back to kthread_create() from kthreadd. */
35 struct task_struct *result; 35 struct task_struct *result;
36 struct completion done; 36 struct completion *done;
37 37
38 struct list_head list; 38 struct list_head list;
39}; 39};
@@ -178,6 +178,7 @@ static int kthread(void *_create)
178 struct kthread_create_info *create = _create; 178 struct kthread_create_info *create = _create;
179 int (*threadfn)(void *data) = create->threadfn; 179 int (*threadfn)(void *data) = create->threadfn;
180 void *data = create->data; 180 void *data = create->data;
181 struct completion *done;
181 struct kthread self; 182 struct kthread self;
182 int ret; 183 int ret;
183 184
@@ -187,10 +188,16 @@ static int kthread(void *_create)
187 init_completion(&self.parked); 188 init_completion(&self.parked);
188 current->vfork_done = &self.exited; 189 current->vfork_done = &self.exited;
189 190
191 /* If user was SIGKILLed, I release the structure. */
192 done = xchg(&create->done, NULL);
193 if (!done) {
194 kfree(create);
195 do_exit(-EINTR);
196 }
190 /* OK, tell user we're spawned, wait for stop or wakeup */ 197 /* OK, tell user we're spawned, wait for stop or wakeup */
191 __set_current_state(TASK_UNINTERRUPTIBLE); 198 __set_current_state(TASK_UNINTERRUPTIBLE);
192 create->result = current; 199 create->result = current;
193 complete(&create->done); 200 complete(done);
194 schedule(); 201 schedule();
195 202
196 ret = -EINTR; 203 ret = -EINTR;
@@ -223,8 +230,15 @@ static void create_kthread(struct kthread_create_info *create)
223 /* We want our own signal handler (we take no signals by default). */ 230 /* We want our own signal handler (we take no signals by default). */
224 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 231 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
225 if (pid < 0) { 232 if (pid < 0) {
233 /* If user was SIGKILLed, I release the structure. */
234 struct completion *done = xchg(&create->done, NULL);
235
236 if (!done) {
237 kfree(create);
238 return;
239 }
226 create->result = ERR_PTR(pid); 240 create->result = ERR_PTR(pid);
227 complete(&create->done); 241 complete(done);
228 } 242 }
229} 243}
230 244
@@ -255,36 +269,59 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
255 const char namefmt[], 269 const char namefmt[],
256 ...) 270 ...)
257{ 271{
258 struct kthread_create_info create; 272 DECLARE_COMPLETION_ONSTACK(done);
259 273 struct task_struct *task;
260 create.threadfn = threadfn; 274 struct kthread_create_info *create = kmalloc(sizeof(*create),
261 create.data = data; 275 GFP_KERNEL);
262 create.node = node; 276
263 init_completion(&create.done); 277 if (!create)
278 return ERR_PTR(-ENOMEM);
279 create->threadfn = threadfn;
280 create->data = data;
281 create->node = node;
282 create->done = &done;
264 283
265 spin_lock(&kthread_create_lock); 284 spin_lock(&kthread_create_lock);
266 list_add_tail(&create.list, &kthread_create_list); 285 list_add_tail(&create->list, &kthread_create_list);
267 spin_unlock(&kthread_create_lock); 286 spin_unlock(&kthread_create_lock);
268 287
269 wake_up_process(kthreadd_task); 288 wake_up_process(kthreadd_task);
270 wait_for_completion(&create.done); 289 /*
271 290 * Wait for completion in killable state, for I might be chosen by
272 if (!IS_ERR(create.result)) { 291 * the OOM killer while kthreadd is trying to allocate memory for
292 * the new kernel thread.
293 */
294 if (unlikely(wait_for_completion_killable(&done))) {
295 /*
296 * If I was SIGKILLed before kthreadd (or new kernel thread)
297 * calls complete(), leave the cleanup of this structure to
298 * that thread.
299 */
300 if (xchg(&create->done, NULL))
301 return ERR_PTR(-ENOMEM);
302 /*
303 * kthreadd (or new kernel thread) will call complete()
304 * shortly.
305 */
306 wait_for_completion(&done);
307 }
308 task = create->result;
309 if (!IS_ERR(task)) {
273 static const struct sched_param param = { .sched_priority = 0 }; 310 static const struct sched_param param = { .sched_priority = 0 };
274 va_list args; 311 va_list args;
275 312
276 va_start(args, namefmt); 313 va_start(args, namefmt);
277 vsnprintf(create.result->comm, sizeof(create.result->comm), 314 vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
278 namefmt, args);
279 va_end(args); 315 va_end(args);
280 /* 316 /*
281 * root may have changed our (kthreadd's) priority or CPU mask. 317 * root may have changed our (kthreadd's) priority or CPU mask.
282 * The kernel thread should not inherit these properties. 318 * The kernel thread should not inherit these properties.
283 */ 319 */
284 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 320 sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
285 set_cpus_allowed_ptr(create.result, cpu_all_mask); 321 set_cpus_allowed_ptr(task, cpu_all_mask);
286 } 322 }
287 return create.result; 323 kfree(create);
324 return task;
288} 325}
289EXPORT_SYMBOL(kthread_create_on_node); 326EXPORT_SYMBOL(kthread_create_on_node);
290 327
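The kthread changes above heap-allocate the create-info and hand off the caller's on-stack completion with xchg(), so a caller killed by the OOM killer can bail out without the new thread completing a completion that no longer exists. The caller-facing API is unchanged; a minimal sketch with a hypothetical thread function:

        #include <linux/kthread.h>
        #include <linux/sched.h>
        #include <linux/err.h>

        static int example_threadfn(void *data)         /* hypothetical */
        {
                while (!kthread_should_stop()) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule();
                }
                __set_current_state(TASK_RUNNING);
                return 0;
        }

        static int example_start(void)
        {
                struct task_struct *tsk;

                /* may now return ERR_PTR(-ENOMEM) if we are SIGKILLed while
                 * kthreadd is blocked allocating memory for the new thread */
                tsk = kthread_run(example_threadfn, NULL, "example");
                return IS_ERR(tsk) ? PTR_ERR(tsk) : 0;
        }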
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
new file mode 100644
index 000000000000..baab8e5e7f66
--- /dev/null
+++ b/kernel/locking/Makefile
@@ -0,0 +1,25 @@
1
2obj-y += mutex.o semaphore.o rwsem.o lglock.o
3
4ifdef CONFIG_FUNCTION_TRACER
5CFLAGS_REMOVE_lockdep.o = -pg
6CFLAGS_REMOVE_lockdep_proc.o = -pg
7CFLAGS_REMOVE_mutex-debug.o = -pg
8CFLAGS_REMOVE_rtmutex-debug.o = -pg
9endif
10
11obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
12obj-$(CONFIG_LOCKDEP) += lockdep.o
13ifeq ($(CONFIG_PROC_FS),y)
14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
15endif
16obj-$(CONFIG_SMP) += spinlock.o
17obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
18obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
19obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
20obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
21obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
22obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
23obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
24obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
25obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
diff --git a/kernel/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..86ae2aebf004 100644
--- a/kernel/lglock.c
+++ b/kernel/locking/lglock.c
diff --git a/kernel/lockdep.c b/kernel/locking/lockdep.c
index e16c45b9ee77..576ba756a32d 100644
--- a/kernel/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1232,7 +1232,7 @@ static int noop_count(struct lock_list *entry, void *data)
1232 return 0; 1232 return 0;
1233} 1233}
1234 1234
1235unsigned long __lockdep_count_forward_deps(struct lock_list *this) 1235static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
1236{ 1236{
1237 unsigned long count = 0; 1237 unsigned long count = 0;
1238 struct lock_list *uninitialized_var(target_entry); 1238 struct lock_list *uninitialized_var(target_entry);
@@ -1258,7 +1258,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
1258 return ret; 1258 return ret;
1259} 1259}
1260 1260
1261unsigned long __lockdep_count_backward_deps(struct lock_list *this) 1261static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
1262{ 1262{
1263 unsigned long count = 0; 1263 unsigned long count = 0;
1264 struct lock_list *uninitialized_var(target_entry); 1264 struct lock_list *uninitialized_var(target_entry);
@@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4225 !rcu_lockdep_current_cpu_online() 4225 !rcu_lockdep_current_cpu_online()
4226 ? "RCU used illegally from offline CPU!\n" 4226 ? "RCU used illegally from offline CPU!\n"
4227 : rcu_is_cpu_idle() 4227 : !rcu_is_watching()
4228 ? "RCU used illegally from idle CPU!\n" 4228 ? "RCU used illegally from idle CPU!\n"
4229 : "", 4229 : "",
4230 rcu_scheduler_active, debug_locks); 4230 rcu_scheduler_active, debug_locks);
@@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4247 * So complain bitterly if someone does call rcu_read_lock(), 4247 * So complain bitterly if someone does call rcu_read_lock(),
4248 * rcu_read_lock_bh() and so on from extended quiescent states. 4248 * rcu_read_lock_bh() and so on from extended quiescent states.
4249 */ 4249 */
4250 if (rcu_is_cpu_idle()) 4250 if (!rcu_is_watching())
4251 printk("RCU used illegally from extended quiescent state!\n"); 4251 printk("RCU used illegally from extended quiescent state!\n");
4252 4252
4253 lockdep_print_held_locks(curr); 4253 lockdep_print_held_locks(curr);
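Besides making the two dependency-count helpers static, the lockdep hunks track an RCU API rename whose sense is inverted; a tiny sketch as a reading aid:

        #include <linux/rcupdate.h>
        #include <linux/kernel.h>

        static void example_check(void)
        {
                if (!rcu_is_watching())         /* was: if (rcu_is_cpu_idle()) */
                        pr_warn("RCU is not watching this CPU\n");
        }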
diff --git a/kernel/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 4f560cfedc8f..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
diff --git a/kernel/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b2c71c5873e4..ef43ac4bafb5 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -421,6 +421,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
421 seq_time(m, lt->min); 421 seq_time(m, lt->min);
422 seq_time(m, lt->max); 422 seq_time(m, lt->max);
423 seq_time(m, lt->total); 423 seq_time(m, lt->total);
424 seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
424} 425}
425 426
426static void seq_stats(struct seq_file *m, struct lock_stat_data *data) 427static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
@@ -518,20 +519,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
518 } 519 }
519 if (i) { 520 if (i) {
520 seq_puts(m, "\n"); 521 seq_puts(m, "\n");
521 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 522 seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1));
522 seq_puts(m, "\n"); 523 seq_puts(m, "\n");
523 } 524 }
524} 525}
525 526
526static void seq_header(struct seq_file *m) 527static void seq_header(struct seq_file *m)
527{ 528{
528 seq_printf(m, "lock_stat version 0.3\n"); 529 seq_puts(m, "lock_stat version 0.4\n");
529 530
530 if (unlikely(!debug_locks)) 531 if (unlikely(!debug_locks))
531 seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); 532 seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
532 533
533 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 534 seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
534 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 535 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s "
535 "%14s %14s\n", 536 "%14s %14s\n",
536 "class name", 537 "class name",
537 "con-bounces", 538 "con-bounces",
@@ -539,12 +540,14 @@ static void seq_header(struct seq_file *m)
539 "waittime-min", 540 "waittime-min",
540 "waittime-max", 541 "waittime-max",
541 "waittime-total", 542 "waittime-total",
543 "waittime-avg",
542 "acq-bounces", 544 "acq-bounces",
543 "acquisitions", 545 "acquisitions",
544 "holdtime-min", 546 "holdtime-min",
545 "holdtime-max", 547 "holdtime-max",
546 "holdtime-total"); 548 "holdtime-total",
547 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 549 "holdtime-avg");
550 seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
548 seq_printf(m, "\n"); 551 seq_printf(m, "\n");
549} 552}
550 553
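The new waittime-avg/holdtime-avg columns are plain averages over the recorded events, computed with div_s64() so the 64-bit total is divided safely on 32-bit builds; a sketch matching the expression added above:

        #include <linux/types.h>
        #include <linux/math64.h>

        static inline s64 lock_time_avg(s64 total, unsigned long nr)   /* sketch */
        {
                return nr ? div_s64(total, nr) : 0;
        }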
diff --git a/kernel/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84c..995b0cc2b84c 100644
--- a/kernel/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
diff --git a/kernel/mutex-debug.c b/kernel/locking/mutex-debug.c
index 7e3443fe1f48..7e3443fe1f48 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
diff --git a/kernel/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cfa..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
diff --git a/kernel/mutex.c b/kernel/locking/mutex.c
index 6d647aedffea..4dd6e4c219de 100644
--- a/kernel/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/mutex.c 2 * kernel/locking/mutex.c
3 * 3 *
4 * Mutexes: blocking mutual exclusion locks 4 * Mutexes: blocking mutual exclusion locks
5 * 5 *
@@ -410,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
410static __always_inline int __sched 410static __always_inline int __sched
411__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 411__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
412 struct lockdep_map *nest_lock, unsigned long ip, 412 struct lockdep_map *nest_lock, unsigned long ip,
413 struct ww_acquire_ctx *ww_ctx) 413 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
414{ 414{
415 struct task_struct *task = current; 415 struct task_struct *task = current;
416 struct mutex_waiter waiter; 416 struct mutex_waiter waiter;
@@ -450,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
450 struct task_struct *owner; 450 struct task_struct *owner;
451 struct mspin_node node; 451 struct mspin_node node;
452 452
453 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { 453 if (use_ww_ctx && ww_ctx->acquired > 0) {
454 struct ww_mutex *ww; 454 struct ww_mutex *ww;
455 455
456 ww = container_of(lock, struct ww_mutex, base); 456 ww = container_of(lock, struct ww_mutex, base);
@@ -480,7 +480,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
480 if ((atomic_read(&lock->count) == 1) && 480 if ((atomic_read(&lock->count) == 1) &&
481 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 481 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
482 lock_acquired(&lock->dep_map, ip); 482 lock_acquired(&lock->dep_map, ip);
483 if (!__builtin_constant_p(ww_ctx == NULL)) { 483 if (use_ww_ctx) {
484 struct ww_mutex *ww; 484 struct ww_mutex *ww;
485 ww = container_of(lock, struct ww_mutex, base); 485 ww = container_of(lock, struct ww_mutex, base);
486 486
@@ -551,7 +551,7 @@ slowpath:
551 goto err; 551 goto err;
552 } 552 }
553 553
554 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { 554 if (use_ww_ctx && ww_ctx->acquired > 0) {
555 ret = __mutex_lock_check_stamp(lock, ww_ctx); 555 ret = __mutex_lock_check_stamp(lock, ww_ctx);
556 if (ret) 556 if (ret)
557 goto err; 557 goto err;
@@ -575,7 +575,7 @@ skip_wait:
575 lock_acquired(&lock->dep_map, ip); 575 lock_acquired(&lock->dep_map, ip);
576 mutex_set_owner(lock); 576 mutex_set_owner(lock);
577 577
578 if (!__builtin_constant_p(ww_ctx == NULL)) { 578 if (use_ww_ctx) {
579 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 579 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
580 struct mutex_waiter *cur; 580 struct mutex_waiter *cur;
581 581
@@ -615,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
615{ 615{
616 might_sleep(); 616 might_sleep();
617 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 617 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
618 subclass, NULL, _RET_IP_, NULL); 618 subclass, NULL, _RET_IP_, NULL, 0);
619} 619}
620 620
621EXPORT_SYMBOL_GPL(mutex_lock_nested); 621EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -625,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
625{ 625{
626 might_sleep(); 626 might_sleep();
627 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 627 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
628 0, nest, _RET_IP_, NULL); 628 0, nest, _RET_IP_, NULL, 0);
629} 629}
630 630
631EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); 631EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -635,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
635{ 635{
636 might_sleep(); 636 might_sleep();
637 return __mutex_lock_common(lock, TASK_KILLABLE, 637 return __mutex_lock_common(lock, TASK_KILLABLE,
638 subclass, NULL, _RET_IP_, NULL); 638 subclass, NULL, _RET_IP_, NULL, 0);
639} 639}
640EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 640EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
641 641
@@ -644,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
644{ 644{
645 might_sleep(); 645 might_sleep();
646 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 646 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
647 subclass, NULL, _RET_IP_, NULL); 647 subclass, NULL, _RET_IP_, NULL, 0);
648} 648}
649 649
650EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 650EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -682,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
682 682
683 might_sleep(); 683 might_sleep();
684 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 684 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
685 0, &ctx->dep_map, _RET_IP_, ctx); 685 0, &ctx->dep_map, _RET_IP_, ctx, 1);
686 if (!ret && ctx->acquired > 1) 686 if (!ret && ctx->acquired > 1)
687 return ww_mutex_deadlock_injection(lock, ctx); 687 return ww_mutex_deadlock_injection(lock, ctx);
688 688
@@ -697,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
697 697
698 might_sleep(); 698 might_sleep();
699 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 699 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
700 0, &ctx->dep_map, _RET_IP_, ctx); 700 0, &ctx->dep_map, _RET_IP_, ctx, 1);
701 701
702 if (!ret && ctx->acquired > 1) 702 if (!ret && ctx->acquired > 1)
703 return ww_mutex_deadlock_injection(lock, ctx); 703 return ww_mutex_deadlock_injection(lock, ctx);
@@ -809,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count)
809 struct mutex *lock = container_of(lock_count, struct mutex, count); 809 struct mutex *lock = container_of(lock_count, struct mutex, count);
810 810
811 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, 811 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
812 NULL, _RET_IP_, NULL); 812 NULL, _RET_IP_, NULL, 0);
813} 813}
814 814
815static noinline int __sched 815static noinline int __sched
816__mutex_lock_killable_slowpath(struct mutex *lock) 816__mutex_lock_killable_slowpath(struct mutex *lock)
817{ 817{
818 return __mutex_lock_common(lock, TASK_KILLABLE, 0, 818 return __mutex_lock_common(lock, TASK_KILLABLE, 0,
819 NULL, _RET_IP_, NULL); 819 NULL, _RET_IP_, NULL, 0);
820} 820}
821 821
822static noinline int __sched 822static noinline int __sched
823__mutex_lock_interruptible_slowpath(struct mutex *lock) 823__mutex_lock_interruptible_slowpath(struct mutex *lock)
824{ 824{
825 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, 825 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
826 NULL, _RET_IP_, NULL); 826 NULL, _RET_IP_, NULL, 0);
827} 827}
828 828
829static noinline int __sched 829static noinline int __sched
830__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) 830__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
831{ 831{
832 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, 832 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
833 NULL, _RET_IP_, ctx); 833 NULL, _RET_IP_, ctx, 1);
834} 834}
835 835
836static noinline int __sched 836static noinline int __sched
@@ -838,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
838 struct ww_acquire_ctx *ctx) 838 struct ww_acquire_ctx *ctx)
839{ 839{
840 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, 840 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
841 NULL, _RET_IP_, ctx); 841 NULL, _RET_IP_, ctx, 1);
842} 842}
843 843
844#endif 844#endif
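Passing an explicit use_ww_ctx flag (always a literal 0 or 1 at the call sites above) keeps the effect of the old __builtin_constant_p(ww_ctx == NULL) trick: the always-inlined common path still has its ww-specific branches folded away for the plain mutex entry points, but the intent is now visible in the signature. A compile-time sketch of the mechanism, with hypothetical names:

        #include <linux/compiler.h>
        #include <linux/types.h>

        static __always_inline int example_lock_common(const bool use_ww_ctx)
        {
                if (use_ww_ctx)
                        return 1;       /* ww-mutex specific work, dropped when the constant is false */
                return 0;
        }

        static int example_plain_lock(void) { return example_lock_common(false); } /* branch eliminated */
        static int example_ww_lock(void)    { return example_lock_common(true);  } /* branch kept */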
diff --git a/kernel/mutex.h b/kernel/locking/mutex.h
index 4115fbf83b12..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/locking/mutex.h
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
new file mode 100644
index 000000000000..652a8ee8efe9
--- /dev/null
+++ b/kernel/locking/percpu-rwsem.c
@@ -0,0 +1,165 @@
1#include <linux/atomic.h>
2#include <linux/rwsem.h>
3#include <linux/percpu.h>
4#include <linux/wait.h>
5#include <linux/lockdep.h>
6#include <linux/percpu-rwsem.h>
7#include <linux/rcupdate.h>
8#include <linux/sched.h>
9#include <linux/errno.h>
10
11int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
12 const char *name, struct lock_class_key *rwsem_key)
13{
14 brw->fast_read_ctr = alloc_percpu(int);
15 if (unlikely(!brw->fast_read_ctr))
16 return -ENOMEM;
17
18 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
19 __init_rwsem(&brw->rw_sem, name, rwsem_key);
20 atomic_set(&brw->write_ctr, 0);
21 atomic_set(&brw->slow_read_ctr, 0);
22 init_waitqueue_head(&brw->write_waitq);
23 return 0;
24}
25
26void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
27{
28 free_percpu(brw->fast_read_ctr);
29 brw->fast_read_ctr = NULL; /* catch use after free bugs */
30}
31
32/*
33 * This is the fast-path for down_read/up_read, it only needs to ensure
34 * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
35 * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
36 * serialize with the preempt-disabled section below.
37 *
38 * The nontrivial part is that we should guarantee acquire/release semantics
39 * in case when
40 *
41 * R_W: down_write() comes after up_read(), the writer should see all
42 * changes done by the reader
43 * or
44 * W_R: down_read() comes after up_write(), the reader should see all
45 * changes done by the writer
46 *
47 * If this helper fails the callers rely on the normal rw_semaphore and
48 * atomic_dec_and_test(), so in this case we have the necessary barriers.
49 *
50 * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
51 * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
52 * reader inside the critical section. See the comments in down_write and
53 * up_write below.
54 */
55static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
56{
57 bool success = false;
58
59 preempt_disable();
60 if (likely(!atomic_read(&brw->write_ctr))) {
61 __this_cpu_add(*brw->fast_read_ctr, val);
62 success = true;
63 }
64 preempt_enable();
65
66 return success;
67}
68
69/*
70 * Like the normal down_read() this is not recursive, the writer can
71 * come after the first percpu_down_read() and create the deadlock.
72 *
73 * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
74 * percpu_up_read() does rwsem_release(). This pairs with the usage
75 * of ->rw_sem in percpu_down/up_write().
76 */
77void percpu_down_read(struct percpu_rw_semaphore *brw)
78{
79 might_sleep();
80 if (likely(update_fast_ctr(brw, +1))) {
81 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
82 return;
83 }
84
85 down_read(&brw->rw_sem);
86 atomic_inc(&brw->slow_read_ctr);
87 /* avoid up_read()->rwsem_release() */
88 __up_read(&brw->rw_sem);
89}
90
91void percpu_up_read(struct percpu_rw_semaphore *brw)
92{
93 rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
94
95 if (likely(update_fast_ctr(brw, -1)))
96 return;
97
98 /* false-positive is possible but harmless */
99 if (atomic_dec_and_test(&brw->slow_read_ctr))
100 wake_up_all(&brw->write_waitq);
101}
102
103static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
104{
105 unsigned int sum = 0;
106 int cpu;
107
108 for_each_possible_cpu(cpu) {
109 sum += per_cpu(*brw->fast_read_ctr, cpu);
110 per_cpu(*brw->fast_read_ctr, cpu) = 0;
111 }
112
113 return sum;
114}
115
116/*
117 * A writer increments ->write_ctr to force the readers to switch to the
118 * slow mode, note the atomic_read() check in update_fast_ctr().
119 *
120 * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
121 * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
122 * counter it represents the number of active readers.
123 *
124 * Finally the writer takes ->rw_sem for writing and blocks the new readers,
125 * then waits until the slow counter becomes zero.
126 */
127void percpu_down_write(struct percpu_rw_semaphore *brw)
128{
129 /* tell update_fast_ctr() there is a pending writer */
130 atomic_inc(&brw->write_ctr);
131 /*
132 * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
133 * so that update_fast_ctr() can't succeed.
134 *
135 * 2. Ensures we see the result of every previous this_cpu_add() in
136 * update_fast_ctr().
137 *
138 * 3. Ensures that if any reader has exited its critical section via
139 * fast-path, it executes a full memory barrier before we return.
140 * See R_W case in the comment above update_fast_ctr().
141 */
142 synchronize_sched_expedited();
143
144 /* exclude other writers, and block the new readers completely */
145 down_write(&brw->rw_sem);
146
147 /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
148 atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
149
150 /* wait for all readers to complete their percpu_up_read() */
151 wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
152}
153
154void percpu_up_write(struct percpu_rw_semaphore *brw)
155{
156 /* release the lock, but the readers can't use the fast-path */
157 up_write(&brw->rw_sem);
158 /*
159 * Insert the barrier before the next fast-path in down_read,
160 * see W_R case in the comment above update_fast_ctr().
161 */
162 synchronize_sched_expedited();
163 /* the last writer unblocks update_fast_ctr() */
164 atomic_dec(&brw->write_ctr);
165}
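Usage of the new percpu rwsem mirrors an ordinary rwsem; readers stay on the preempt-disabled per-CPU fast path unless a writer has bumped write_ctr. A minimal sketch with a hypothetical semaphore:

        #include <linux/percpu-rwsem.h>

        static struct percpu_rw_semaphore example_sem;          /* hypothetical */

        static int example_init(void)
        {
                return percpu_init_rwsem(&example_sem);         /* allocates fast_read_ctr */
        }

        static void example_read_side(void)
        {
                percpu_down_read(&example_sem);                 /* usually just a per-CPU increment */
                /* ... read-side critical section ... */
                percpu_up_read(&example_sem);
        }

        static void example_write_side(void)
        {
                percpu_down_write(&example_sem);                /* waits for all readers to drain */
                /* ... write-side critical section ... */
                percpu_up_write(&example_sem);
        }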
diff --git a/kernel/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 13b243a323fa..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
diff --git a/kernel/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d78..14193d596d78 100644
--- a/kernel/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
diff --git a/kernel/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
index 1d96dd0d93c1..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/locking/rtmutex-tester.c
diff --git a/kernel/rtmutex.c b/kernel/locking/rtmutex.c
index 0dd6aec1cb6a..0dd6aec1cb6a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/locking/rtmutex.c
diff --git a/kernel/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421d..a1a1dd06421d 100644
--- a/kernel/rtmutex.h
+++ b/kernel/locking/rtmutex.h
diff --git a/kernel/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 53a66c85261b..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
new file mode 100644
index 000000000000..9be8a9144978
--- /dev/null
+++ b/kernel/locking/rwsem-spinlock.c
@@ -0,0 +1,296 @@
1/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
2 * generic spinlock implementation
3 *
4 * Copyright (c) 2001 David Howells (dhowells@redhat.com).
5 * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
6 * - Derived also from comments by Linus
7 */
8#include <linux/rwsem.h>
9#include <linux/sched.h>
10#include <linux/export.h>
11
12enum rwsem_waiter_type {
13 RWSEM_WAITING_FOR_WRITE,
14 RWSEM_WAITING_FOR_READ
15};
16
17struct rwsem_waiter {
18 struct list_head list;
19 struct task_struct *task;
20 enum rwsem_waiter_type type;
21};
22
23int rwsem_is_locked(struct rw_semaphore *sem)
24{
25 int ret = 1;
26 unsigned long flags;
27
28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
29 ret = (sem->activity != 0);
30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
31 }
32 return ret;
33}
34EXPORT_SYMBOL(rwsem_is_locked);
35
36/*
37 * initialise the semaphore
38 */
39void __init_rwsem(struct rw_semaphore *sem, const char *name,
40 struct lock_class_key *key)
41{
42#ifdef CONFIG_DEBUG_LOCK_ALLOC
43 /*
44 * Make sure we are not reinitializing a held semaphore:
45 */
46 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
47 lockdep_init_map(&sem->dep_map, name, key, 0);
48#endif
49 sem->activity = 0;
50 raw_spin_lock_init(&sem->wait_lock);
51 INIT_LIST_HEAD(&sem->wait_list);
52}
53EXPORT_SYMBOL(__init_rwsem);
54
55/*
56 * handle the lock release when processes blocked on it that can now run
57 * - if we come here, then:
58 * - the 'active count' _reached_ zero
59 * - the 'waiting count' is non-zero
60 * - the spinlock must be held by the caller
61 * - woken process blocks are discarded from the list after having task zeroed
62 * - writers are only woken if wakewrite is non-zero
63 */
64static inline struct rw_semaphore *
65__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
66{
67 struct rwsem_waiter *waiter;
68 struct task_struct *tsk;
69 int woken;
70
71 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
72
73 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
74 if (wakewrite)
75 /* Wake up a writer. Note that we do not grant it the
76 * lock - it will have to acquire it when it runs. */
77 wake_up_process(waiter->task);
78 goto out;
79 }
80
81 /* grant an infinite number of read locks to the front of the queue */
82 woken = 0;
83 do {
84 struct list_head *next = waiter->list.next;
85
86 list_del(&waiter->list);
87 tsk = waiter->task;
88 smp_mb();
89 waiter->task = NULL;
90 wake_up_process(tsk);
91 put_task_struct(tsk);
92 woken++;
93 if (next == &sem->wait_list)
94 break;
95 waiter = list_entry(next, struct rwsem_waiter, list);
96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
97
98 sem->activity += woken;
99
100 out:
101 return sem;
102}
103
104/*
105 * wake a single writer
106 */
107static inline struct rw_semaphore *
108__rwsem_wake_one_writer(struct rw_semaphore *sem)
109{
110 struct rwsem_waiter *waiter;
111
112 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
113 wake_up_process(waiter->task);
114
115 return sem;
116}
117
118/*
119 * get a read lock on the semaphore
120 */
121void __sched __down_read(struct rw_semaphore *sem)
122{
123 struct rwsem_waiter waiter;
124 struct task_struct *tsk;
125 unsigned long flags;
126
127 raw_spin_lock_irqsave(&sem->wait_lock, flags);
128
129 if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
130 /* granted */
131 sem->activity++;
132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
133 goto out;
134 }
135
136 tsk = current;
137 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
138
139 /* set up my own style of waitqueue */
140 waiter.task = tsk;
141 waiter.type = RWSEM_WAITING_FOR_READ;
142 get_task_struct(tsk);
143
144 list_add_tail(&waiter.list, &sem->wait_list);
145
146 /* we don't need to touch the semaphore struct anymore */
147 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
148
149 /* wait to be given the lock */
150 for (;;) {
151 if (!waiter.task)
152 break;
153 schedule();
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 }
156
157 tsk->state = TASK_RUNNING;
158 out:
159 ;
160}
161
162/*
163 * trylock for reading -- returns 1 if successful, 0 if contention
164 */
165int __down_read_trylock(struct rw_semaphore *sem)
166{
167 unsigned long flags;
168 int ret = 0;
169
170
171 raw_spin_lock_irqsave(&sem->wait_lock, flags);
172
173 if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
174 /* granted */
175 sem->activity++;
176 ret = 1;
177 }
178
179 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
180
181 return ret;
182}
183
184/*
185 * get a write lock on the semaphore
186 */
187void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
188{
189 struct rwsem_waiter waiter;
190 struct task_struct *tsk;
191 unsigned long flags;
192
193 raw_spin_lock_irqsave(&sem->wait_lock, flags);
194
195 /* set up my own style of waitqueue */
196 tsk = current;
197 waiter.task = tsk;
198 waiter.type = RWSEM_WAITING_FOR_WRITE;
199 list_add_tail(&waiter.list, &sem->wait_list);
200
201 /* wait for someone to release the lock */
202 for (;;) {
203 /*
204 * That is the key to supporting write lock stealing: it allows the
205 * task already on the CPU to take the lock soon, rather than putting
206 * itself to sleep and waiting for the system or someone else at the
207 * head of the wait list to wake it up.
208 */
209 if (sem->activity == 0)
210 break;
211 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
213 schedule();
214 raw_spin_lock_irqsave(&sem->wait_lock, flags);
215 }
216 /* got the lock */
217 sem->activity = -1;
218 list_del(&waiter.list);
219
220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
221}
222
223void __sched __down_write(struct rw_semaphore *sem)
224{
225 __down_write_nested(sem, 0);
226}
227
228/*
229 * trylock for writing -- returns 1 if successful, 0 if contention
230 */
231int __down_write_trylock(struct rw_semaphore *sem)
232{
233 unsigned long flags;
234 int ret = 0;
235
236 raw_spin_lock_irqsave(&sem->wait_lock, flags);
237
238 if (sem->activity == 0) {
239 /* got the lock */
240 sem->activity = -1;
241 ret = 1;
242 }
243
244 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
245
246 return ret;
247}
248
249/*
250 * release a read lock on the semaphore
251 */
252void __up_read(struct rw_semaphore *sem)
253{
254 unsigned long flags;
255
256 raw_spin_lock_irqsave(&sem->wait_lock, flags);
257
258 if (--sem->activity == 0 && !list_empty(&sem->wait_list))
259 sem = __rwsem_wake_one_writer(sem);
260
261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
262}
263
264/*
265 * release a write lock on the semaphore
266 */
267void __up_write(struct rw_semaphore *sem)
268{
269 unsigned long flags;
270
271 raw_spin_lock_irqsave(&sem->wait_lock, flags);
272
273 sem->activity = 0;
274 if (!list_empty(&sem->wait_list))
275 sem = __rwsem_do_wake(sem, 1);
276
277 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
278}
279
280/*
281 * downgrade a write lock into a read lock
282 * - just wake up any readers at the front of the queue
283 */
284void __downgrade_write(struct rw_semaphore *sem)
285{
286 unsigned long flags;
287
288 raw_spin_lock_irqsave(&sem->wait_lock, flags);
289
290 sem->activity = 1;
291 if (!list_empty(&sem->wait_list))
292 sem = __rwsem_do_wake(sem, 0);
293
294 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
295}
296
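Both rwsem implementations sit behind the same slow-path entry points, so callers keep using the ordinary API regardless of which file is built. A minimal usage sketch with a hypothetical semaphore:

        #include <linux/rwsem.h>

        static DECLARE_RWSEM(example_rwsem);            /* hypothetical */

        static void example_reader(void)
        {
                down_read(&example_rwsem);
                /* ... read-side critical section ... */
                up_read(&example_rwsem);
        }

        static void example_writer(void)
        {
                down_write(&example_rwsem);
                /* ... write-side critical section ... */
                downgrade_write(&example_rwsem);        /* wakes queued readers, as in __downgrade_write() above */
                up_read(&example_rwsem);
        }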
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
new file mode 100644
index 000000000000..19c5fa95e0b4
--- /dev/null
+++ b/kernel/locking/rwsem-xadd.c
@@ -0,0 +1,293 @@
1/* rwsem.c: R/W semaphores: contention handling functions
2 *
3 * Written by David Howells (dhowells@redhat.com).
4 * Derived from arch/i386/kernel/semaphore.c
5 *
6 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
7 * and Michel Lespinasse <walken@google.com>
8 */
9#include <linux/rwsem.h>
10#include <linux/sched.h>
11#include <linux/init.h>
12#include <linux/export.h>
13
14/*
15 * Initialize an rwsem:
16 */
17void __init_rwsem(struct rw_semaphore *sem, const char *name,
18 struct lock_class_key *key)
19{
20#ifdef CONFIG_DEBUG_LOCK_ALLOC
21 /*
22 * Make sure we are not reinitializing a held semaphore:
23 */
24 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
25 lockdep_init_map(&sem->dep_map, name, key, 0);
26#endif
27 sem->count = RWSEM_UNLOCKED_VALUE;
28 raw_spin_lock_init(&sem->wait_lock);
29 INIT_LIST_HEAD(&sem->wait_list);
30}
31
32EXPORT_SYMBOL(__init_rwsem);
33
34enum rwsem_waiter_type {
35 RWSEM_WAITING_FOR_WRITE,
36 RWSEM_WAITING_FOR_READ
37};
38
39struct rwsem_waiter {
40 struct list_head list;
41 struct task_struct *task;
42 enum rwsem_waiter_type type;
43};
44
45enum rwsem_wake_type {
46 RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
47 RWSEM_WAKE_READERS, /* Wake readers only */
48 RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
49};
50
51/*
52 * handle the lock release when processes blocked on it that can now run
53 * - if we come here from up_xxxx(), then:
54 * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
55 * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
56 * - there must be someone on the queue
57 * - the spinlock must be held by the caller
58 * - woken process blocks are discarded from the list after having task zeroed
59 * - writers are only woken if downgrading is false
60 */
61static struct rw_semaphore *
62__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
63{
64 struct rwsem_waiter *waiter;
65 struct task_struct *tsk;
66 struct list_head *next;
67 long oldcount, woken, loop, adjustment;
68
69 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
70 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
71 if (wake_type == RWSEM_WAKE_ANY)
72 /* Wake writer at the front of the queue, but do not
73 * grant it the lock yet as we want other writers
74 * to be able to steal it. Readers, on the other hand,
75 * will block as they will notice the queued writer.
76 */
77 wake_up_process(waiter->task);
78 goto out;
79 }
80
81 /* Writers might steal the lock before we grant it to the next reader.
82 * We prefer to do the first reader grant before counting readers
83 * so we can bail out early if a writer stole the lock.
84 */
85 adjustment = 0;
86 if (wake_type != RWSEM_WAKE_READ_OWNED) {
87 adjustment = RWSEM_ACTIVE_READ_BIAS;
88 try_reader_grant:
89 oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
90 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
91 /* A writer stole the lock. Undo our reader grant. */
92 if (rwsem_atomic_update(-adjustment, sem) &
93 RWSEM_ACTIVE_MASK)
94 goto out;
95 /* Last active locker left. Retry waking readers. */
96 goto try_reader_grant;
97 }
98 }
99
100 /* Grant an infinite number of read locks to the readers at the front
101 * of the queue. Note we increment the 'active part' of the count by
102 * the number of readers before waking any processes up.
103 */
104 woken = 0;
105 do {
106 woken++;
107
108 if (waiter->list.next == &sem->wait_list)
109 break;
110
111 waiter = list_entry(waiter->list.next,
112 struct rwsem_waiter, list);
113
114 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
115
116 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
117 if (waiter->type != RWSEM_WAITING_FOR_WRITE)
118 /* hit end of list above */
119 adjustment -= RWSEM_WAITING_BIAS;
120
121 if (adjustment)
122 rwsem_atomic_add(adjustment, sem);
123
124 next = sem->wait_list.next;
125 loop = woken;
126 do {
127 waiter = list_entry(next, struct rwsem_waiter, list);
128 next = waiter->list.next;
129 tsk = waiter->task;
130 smp_mb();
131 waiter->task = NULL;
132 wake_up_process(tsk);
133 put_task_struct(tsk);
134 } while (--loop);
135
136 sem->wait_list.next = next;
137 next->prev = &sem->wait_list;
138
139 out:
140 return sem;
141}
142
143/*
144 * wait for the read lock to be granted
145 */
146struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
147{
148 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
149 struct rwsem_waiter waiter;
150 struct task_struct *tsk = current;
151
152 /* set up my own style of waitqueue */
153 waiter.task = tsk;
154 waiter.type = RWSEM_WAITING_FOR_READ;
155 get_task_struct(tsk);
156
157 raw_spin_lock_irq(&sem->wait_lock);
158 if (list_empty(&sem->wait_list))
159 adjustment += RWSEM_WAITING_BIAS;
160 list_add_tail(&waiter.list, &sem->wait_list);
161
162 /* we're now waiting on the lock, but no longer actively locking */
163 count = rwsem_atomic_update(adjustment, sem);
164
165 /* If there are no active locks, wake the front queued process(es).
166 *
167 * If there are no writers and we are first in the queue,
168 * wake our own waiter to join the existing active readers !
169 */
170 if (count == RWSEM_WAITING_BIAS ||
171 (count > RWSEM_WAITING_BIAS &&
172 adjustment != -RWSEM_ACTIVE_READ_BIAS))
173 sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
174
175 raw_spin_unlock_irq(&sem->wait_lock);
176
177 /* wait to be given the lock */
178 while (true) {
179 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
180 if (!waiter.task)
181 break;
182 schedule();
183 }
184
185 tsk->state = TASK_RUNNING;
186
187 return sem;
188}
189
190/*
191 * wait until we successfully acquire the write lock
192 */
193struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
194{
195 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
196 struct rwsem_waiter waiter;
197 struct task_struct *tsk = current;
198
199 /* set up my own style of waitqueue */
200 waiter.task = tsk;
201 waiter.type = RWSEM_WAITING_FOR_WRITE;
202
203 raw_spin_lock_irq(&sem->wait_lock);
204 if (list_empty(&sem->wait_list))
205 adjustment += RWSEM_WAITING_BIAS;
206 list_add_tail(&waiter.list, &sem->wait_list);
207
208 /* we're now waiting on the lock, but no longer actively locking */
209 count = rwsem_atomic_update(adjustment, sem);
210
211 /* If there were already threads queued before us and there are no
212 * active writers, the lock must be read owned; so we try to wake
213 * any read locks that were queued ahead of us. */
214 if (count > RWSEM_WAITING_BIAS &&
215 adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
216 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
217
218 /* wait until we successfully acquire the lock */
219 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
220 while (true) {
221 if (!(count & RWSEM_ACTIVE_MASK)) {
222 /* Try acquiring the write lock. */
223 count = RWSEM_ACTIVE_WRITE_BIAS;
224 if (!list_is_singular(&sem->wait_list))
225 count += RWSEM_WAITING_BIAS;
226
227 if (sem->count == RWSEM_WAITING_BIAS &&
228 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
229 RWSEM_WAITING_BIAS)
230 break;
231 }
232
233 raw_spin_unlock_irq(&sem->wait_lock);
234
235 /* Block until there are no active lockers. */
236 do {
237 schedule();
238 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
239 } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
240
241 raw_spin_lock_irq(&sem->wait_lock);
242 }
243
244 list_del(&waiter.list);
245 raw_spin_unlock_irq(&sem->wait_lock);
246 tsk->state = TASK_RUNNING;
247
248 return sem;
249}
250
251/*
252 * handle waking up a waiter on the semaphore
253 * - up_read/up_write has decremented the active part of count if we come here
254 */
255struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
256{
257 unsigned long flags;
258
259 raw_spin_lock_irqsave(&sem->wait_lock, flags);
260
261 /* do nothing if list empty */
262 if (!list_empty(&sem->wait_list))
263 sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
264
265 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
266
267 return sem;
268}
269
270/*
271 * downgrade a write lock into a read lock
272 * - caller incremented waiting part of count and discovered it still negative
273 * - just wake up any readers at the front of the queue
274 */
275struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
276{
277 unsigned long flags;
278
279 raw_spin_lock_irqsave(&sem->wait_lock, flags);
280
281 /* do nothing if list empty */
282 if (!list_empty(&sem->wait_list))
283 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
284
285 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
286
287 return sem;
288}
289
290EXPORT_SYMBOL(rwsem_down_read_failed);
291EXPORT_SYMBOL(rwsem_down_write_failed);
292EXPORT_SYMBOL(rwsem_wake);
293EXPORT_SYMBOL(rwsem_downgrade_wake);
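The xadd variant packs the whole state into sem->count using per-arch bias constants; as a reading aid for the atomics above (exact values differ per architecture), a sketch of the encoding:

        /* count == 0                     lock is free
         * each reader adds               RWSEM_ACTIVE_READ_BIAS (the low "active" part)
         * a writer adds                  RWSEM_ACTIVE_WRITE_BIAS
         *                                (= RWSEM_WAITING_BIAS + one active unit)
         * a non-empty wait list adds     RWSEM_WAITING_BIAS once (a large negative bias)
         * count == RWSEM_WAITING_BIAS    no active lockers, only waiters -- exactly the
         *                                state the cmpxchg() in rwsem_down_write_failed()
         *                                above tests for
         */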
diff --git a/kernel/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdfb..cfff1435bdfb 100644
--- a/kernel/rwsem.c
+++ b/kernel/locking/rwsem.c
diff --git a/kernel/semaphore.c b/kernel/locking/semaphore.c
index 6815171a4fff..6815171a4fff 100644
--- a/kernel/semaphore.c
+++ b/kernel/locking/semaphore.c
diff --git a/kernel/spinlock.c b/kernel/locking/spinlock.c
index 5cdd8065a3ce..4b082b5cac9e 100644
--- a/kernel/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -34,6 +34,20 @@
34#else 34#else
35#define raw_read_can_lock(l) read_can_lock(l) 35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l) 36#define raw_write_can_lock(l) write_can_lock(l)
37
38/*
39 * Some architectures can relax in favour of the CPU owning the lock.
40 */
41#ifndef arch_read_relax
42# define arch_read_relax(l) cpu_relax()
43#endif
44#ifndef arch_write_relax
45# define arch_write_relax(l) cpu_relax()
46#endif
47#ifndef arch_spin_relax
48# define arch_spin_relax(l) cpu_relax()
49#endif
50
37/* 51/*
38 * We build the __lock_function inlines here. They are too large for 52 * We build the __lock_function inlines here. They are too large for
39 * inlining all over the place, but here is only one user per function 53 * inlining all over the place, but here is only one user per function
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
new file mode 100644
index 000000000000..0374a596cffa
--- /dev/null
+++ b/kernel/locking/spinlock_debug.c
@@ -0,0 +1,302 @@
1/*
2 * Copyright 2005, Red Hat, Inc., Ingo Molnar
3 * Released under the General Public License (GPL).
4 *
5 * This file contains the spinlock/rwlock implementations for
6 * DEBUG_SPINLOCK.
7 */
8
9#include <linux/spinlock.h>
10#include <linux/nmi.h>
11#include <linux/interrupt.h>
12#include <linux/debug_locks.h>
13#include <linux/delay.h>
14#include <linux/export.h>
15
16void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
17 struct lock_class_key *key)
18{
19#ifdef CONFIG_DEBUG_LOCK_ALLOC
20 /*
21 * Make sure we are not reinitializing a held lock:
22 */
23 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
24 lockdep_init_map(&lock->dep_map, name, key, 0);
25#endif
26 lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
27 lock->magic = SPINLOCK_MAGIC;
28 lock->owner = SPINLOCK_OWNER_INIT;
29 lock->owner_cpu = -1;
30}
31
32EXPORT_SYMBOL(__raw_spin_lock_init);
33
34void __rwlock_init(rwlock_t *lock, const char *name,
35 struct lock_class_key *key)
36{
37#ifdef CONFIG_DEBUG_LOCK_ALLOC
38 /*
39 * Make sure we are not reinitializing a held lock:
40 */
41 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
42 lockdep_init_map(&lock->dep_map, name, key, 0);
43#endif
44 lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED;
45 lock->magic = RWLOCK_MAGIC;
46 lock->owner = SPINLOCK_OWNER_INIT;
47 lock->owner_cpu = -1;
48}
49
50EXPORT_SYMBOL(__rwlock_init);
51
52static void spin_dump(raw_spinlock_t *lock, const char *msg)
53{
54 struct task_struct *owner = NULL;
55
56 if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
57 owner = lock->owner;
58 printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
59 msg, raw_smp_processor_id(),
60 current->comm, task_pid_nr(current));
61 printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
62 ".owner_cpu: %d\n",
63 lock, lock->magic,
64 owner ? owner->comm : "<none>",
65 owner ? task_pid_nr(owner) : -1,
66 lock->owner_cpu);
67 dump_stack();
68}
69
70static void spin_bug(raw_spinlock_t *lock, const char *msg)
71{
72 if (!debug_locks_off())
73 return;
74
75 spin_dump(lock, msg);
76}
77
78#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg)
79
80static inline void
81debug_spin_lock_before(raw_spinlock_t *lock)
82{
83 SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
84 SPIN_BUG_ON(lock->owner == current, lock, "recursion");
85 SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
86 lock, "cpu recursion");
87}
88
89static inline void debug_spin_lock_after(raw_spinlock_t *lock)
90{
91 lock->owner_cpu = raw_smp_processor_id();
92 lock->owner = current;
93}
94
95static inline void debug_spin_unlock(raw_spinlock_t *lock)
96{
97 SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
98 SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked");
99 SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
100 SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
101 lock, "wrong CPU");
102 lock->owner = SPINLOCK_OWNER_INIT;
103 lock->owner_cpu = -1;
104}
105
106static void __spin_lock_debug(raw_spinlock_t *lock)
107{
108 u64 i;
109 u64 loops = loops_per_jiffy * HZ;
110
111 for (i = 0; i < loops; i++) {
112 if (arch_spin_trylock(&lock->raw_lock))
113 return;
114 __delay(1);
115 }
116 /* lockup suspected: */
117 spin_dump(lock, "lockup suspected");
118#ifdef CONFIG_SMP
119 trigger_all_cpu_backtrace();
120#endif
121
122 /*
123 * The trylock above was causing a livelock. Give the lower level arch
124 * specific lock code a chance to acquire the lock. We have already
125 * printed a warning/backtrace at this point. The non-debug arch
126 * specific code might actually succeed in acquiring the lock. If it is
127 * not successful, the end-result is the same - there is no forward
128 * progress.
129 */
130 arch_spin_lock(&lock->raw_lock);
131}
132
133void do_raw_spin_lock(raw_spinlock_t *lock)
134{
135 debug_spin_lock_before(lock);
136 if (unlikely(!arch_spin_trylock(&lock->raw_lock)))
137 __spin_lock_debug(lock);
138 debug_spin_lock_after(lock);
139}
140
141int do_raw_spin_trylock(raw_spinlock_t *lock)
142{
143 int ret = arch_spin_trylock(&lock->raw_lock);
144
145 if (ret)
146 debug_spin_lock_after(lock);
147#ifndef CONFIG_SMP
148 /*
149 * Must not happen on UP:
150 */
151 SPIN_BUG_ON(!ret, lock, "trylock failure on UP");
152#endif
153 return ret;
154}
155
156void do_raw_spin_unlock(raw_spinlock_t *lock)
157{
158 debug_spin_unlock(lock);
159 arch_spin_unlock(&lock->raw_lock);
160}
161
162static void rwlock_bug(rwlock_t *lock, const char *msg)
163{
164 if (!debug_locks_off())
165 return;
166
167 printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n",
168 msg, raw_smp_processor_id(), current->comm,
169 task_pid_nr(current), lock);
170 dump_stack();
171}
172
173#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
174
175#if 0 /* __write_lock_debug() can lock up - maybe this can too? */
176static void __read_lock_debug(rwlock_t *lock)
177{
178 u64 i;
179 u64 loops = loops_per_jiffy * HZ;
180 int print_once = 1;
181
182 for (;;) {
183 for (i = 0; i < loops; i++) {
184 if (arch_read_trylock(&lock->raw_lock))
185 return;
186 __delay(1);
187 }
188 /* lockup suspected: */
189 if (print_once) {
190 print_once = 0;
191 printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, "
192 "%s/%d, %p\n",
193 raw_smp_processor_id(), current->comm,
194 current->pid, lock);
195 dump_stack();
196 }
197 }
198}
199#endif
200
201void do_raw_read_lock(rwlock_t *lock)
202{
203 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
204 arch_read_lock(&lock->raw_lock);
205}
206
207int do_raw_read_trylock(rwlock_t *lock)
208{
209 int ret = arch_read_trylock(&lock->raw_lock);
210
211#ifndef CONFIG_SMP
212 /*
213 * Must not happen on UP:
214 */
215 RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
216#endif
217 return ret;
218}
219
220void do_raw_read_unlock(rwlock_t *lock)
221{
222 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
223 arch_read_unlock(&lock->raw_lock);
224}
225
226static inline void debug_write_lock_before(rwlock_t *lock)
227{
228 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
229 RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
230 RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
231 lock, "cpu recursion");
232}
233
234static inline void debug_write_lock_after(rwlock_t *lock)
235{
236 lock->owner_cpu = raw_smp_processor_id();
237 lock->owner = current;
238}
239
240static inline void debug_write_unlock(rwlock_t *lock)
241{
242 RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
243 RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
244 RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
245 lock, "wrong CPU");
246 lock->owner = SPINLOCK_OWNER_INIT;
247 lock->owner_cpu = -1;
248}
249
250#if 0 /* This can cause lockups */
251static void __write_lock_debug(rwlock_t *lock)
252{
253 u64 i;
254 u64 loops = loops_per_jiffy * HZ;
255 int print_once = 1;
256
257 for (;;) {
258 for (i = 0; i < loops; i++) {
259 if (arch_write_trylock(&lock->raw_lock))
260 return;
261 __delay(1);
262 }
263 /* lockup suspected: */
264 if (print_once) {
265 print_once = 0;
266 printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, "
267 "%s/%d, %p\n",
268 raw_smp_processor_id(), current->comm,
269 current->pid, lock);
270 dump_stack();
271 }
272 }
273}
274#endif
275
276void do_raw_write_lock(rwlock_t *lock)
277{
278 debug_write_lock_before(lock);
279 arch_write_lock(&lock->raw_lock);
280 debug_write_lock_after(lock);
281}
282
283int do_raw_write_trylock(rwlock_t *lock)
284{
285 int ret = arch_write_trylock(&lock->raw_lock);
286
287 if (ret)
288 debug_write_lock_after(lock);
289#ifndef CONFIG_SMP
290 /*
291 * Must not happen on UP:
292 */
293 RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
294#endif
295 return ret;
296}
297
298void do_raw_write_unlock(rwlock_t *lock)
299{
300 debug_write_unlock(lock);
301 arch_write_unlock(&lock->raw_lock);
302}
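
The new spinlock_debug.c tracks a magic value, the owning task and the owning CPU around every lock operation: debug_spin_lock_before() rejects bad magic and self-recursion, debug_spin_lock_after() records the owner, and debug_spin_unlock() insists the unlocker is that owner. The userspace sketch below mirrors only that bookkeeping; pthread_self() stands in for current, assert() for SPIN_BUG_ON(), and struct dbg_lock and its helpers are invented names, not kernel API.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

#define MY_MAGIC 0xdead4ead           /* plays the role of SPINLOCK_MAGIC */

struct dbg_lock {
        pthread_mutex_t raw;          /* stands in for arch_spinlock_t */
        unsigned int magic;           /* catches uninitialised locks   */
        pthread_t owner;              /* stands in for lock->owner     */
        int owned;
};

static void dbg_lock_init(struct dbg_lock *l)
{
        pthread_mutex_init(&l->raw, NULL);
        l->magic = MY_MAGIC;
        l->owned = 0;
}

static void dbg_lock(struct dbg_lock *l)
{
        /* debug_spin_lock_before(): bad magic / recursion checks */
        assert(l->magic == MY_MAGIC);
        assert(!(l->owned && pthread_equal(l->owner, pthread_self())));

        pthread_mutex_lock(&l->raw);

        /* debug_spin_lock_after(): record the owner */
        l->owner = pthread_self();
        l->owned = 1;
}

static void dbg_unlock(struct dbg_lock *l)
{
        /* debug_spin_unlock(): only the recorded owner may unlock */
        assert(l->magic == MY_MAGIC);
        assert(l->owned && pthread_equal(l->owner, pthread_self()));

        l->owned = 0;
        pthread_mutex_unlock(&l->raw);
}

int main(void)
{
        struct dbg_lock l;

        dbg_lock_init(&l);
        dbg_lock(&l);
        puts("locked with ownership checks");
        dbg_unlock(&l);
        return 0;
}
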
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
deleted file mode 100644
index 4a9a86d12c8b..000000000000
--- a/kernel/modsign_certificate.S
+++ /dev/null
@@ -1,12 +0,0 @@
1#include <linux/export.h>
2
3#define GLOBAL(name) \
4 .globl VMLINUX_SYMBOL(name); \
5 VMLINUX_SYMBOL(name):
6
7 .section ".init.data","aw"
8
9GLOBAL(modsign_certificate_list)
10 .incbin "signing_key.x509"
11 .incbin "extra_certificates"
12GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
deleted file mode 100644
index 2b6e69909c39..000000000000
--- a/kernel/modsign_pubkey.c
+++ /dev/null
@@ -1,104 +0,0 @@
1/* Public keys for module signature verification
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/cred.h>
15#include <linux/err.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19struct key *modsign_keyring;
20
21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[];
23
24/*
25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
26 * if modsign.pub changes.
27 */
28static __initdata const char annoy_ccache[] = __TIME__ "foo";
29
30/*
31 * Load the compiled-in keys
32 */
33static __init int module_verify_init(void)
34{
35 pr_notice("Initialise module verification\n");
36
37 modsign_keyring = keyring_alloc(".module_sign",
38 KUIDT_INIT(0), KGIDT_INIT(0),
39 current_cred(),
40 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
41 KEY_USR_VIEW | KEY_USR_READ),
42 KEY_ALLOC_NOT_IN_QUOTA, NULL);
43 if (IS_ERR(modsign_keyring))
44 panic("Can't allocate module signing keyring\n");
45
46 return 0;
47}
48
49/*
50 * Must be initialised before we try and load the keys into the keyring.
51 */
52device_initcall(module_verify_init);
53
54/*
55 * Load the compiled-in keys
56 */
57static __init int load_module_signing_keys(void)
58{
59 key_ref_t key;
60 const u8 *p, *end;
61 size_t plen;
62
63 pr_notice("Loading module verification certificates\n");
64
65 end = modsign_certificate_list_end;
66 p = modsign_certificate_list;
67 while (p < end) {
68 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
69 * than 256 bytes in size.
70 */
71 if (end - p < 4)
72 goto dodgy_cert;
73 if (p[0] != 0x30 &&
74 p[1] != 0x82)
75 goto dodgy_cert;
76 plen = (p[2] << 8) | p[3];
77 plen += 4;
78 if (plen > end - p)
79 goto dodgy_cert;
80
81 key = key_create_or_update(make_key_ref(modsign_keyring, 1),
82 "asymmetric",
83 NULL,
84 p,
85 plen,
86 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
87 KEY_USR_VIEW,
88 KEY_ALLOC_NOT_IN_QUOTA);
89 if (IS_ERR(key))
90 pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
91 PTR_ERR(key));
92 else
93 pr_notice("MODSIGN: Loaded cert '%s'\n",
94 key_ref_to_ptr(key)->description);
95 p += plen;
96 }
97
98 return 0;
99
100dodgy_cert:
101 pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
102 return 0;
103}
104late_initcall(load_module_signing_keys);
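
The removed loader walks a blob of concatenated DER certificates by reading each outer SEQUENCE header: tag 0x30, long-form length marker 0x82, a two-byte big-endian body length, plus the four header bytes themselves, exactly as the plen arithmetic above shows. A small self-contained sketch of that walk follows; walk_cert_list() is a made-up name, and the sketch requires both tag bytes to match before trusting the length.

#include <stdio.h>
#include <stddef.h>

/* Walk a buffer of concatenated DER certificates the way the removed
 * load_module_signing_keys() did: long-form SEQUENCE header, two length
 * bytes, and the total record size is that length plus the 4 header bytes. */
static int walk_cert_list(const unsigned char *p, const unsigned char *end)
{
        while (p < end) {
                size_t plen;

                if (end - p < 4)
                        return -1;               /* truncated header */
                if (p[0] != 0x30 || p[1] != 0x82)
                        return -1;               /* not a long-form SEQUENCE */

                plen = ((size_t)p[2] << 8) | p[3];
                plen += 4;                       /* include the header itself */
                if (plen > (size_t)(end - p))
                        return -1;               /* cert runs past the buffer */

                printf("certificate of %zu bytes\n", plen);
                p += plen;                       /* next certificate */
        }
        return 0;
}

int main(void)
{
        /* A fake 4-byte-body "certificate" just to exercise the walk. */
        static const unsigned char blob[] = {
                0x30, 0x82, 0x00, 0x04, 0xde, 0xad, 0xbe, 0xef,
        };

        return walk_cert_list(blob, blob + sizeof(blob)) ? 1 : 0;
}
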
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
index 24f9247b7d02..915e123a430f 100644
--- a/kernel/module-internal.h
+++ b/kernel/module-internal.h
@@ -9,6 +9,4 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12extern struct key *modsign_keyring;
13
14extern int mod_verify_sig(const void *mod, unsigned long *_modlen); 12extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index dc582749fa13..f5a3b1e8ec51 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -378,23 +378,21 @@ static bool check_symbol(const struct symsearch *syms,
378 if (syms->licence == GPL_ONLY) 378 if (syms->licence == GPL_ONLY)
379 return false; 379 return false;
380 if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { 380 if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
381 printk(KERN_WARNING "Symbol %s is being used " 381 pr_warn("Symbol %s is being used by a non-GPL module, "
382 "by a non-GPL module, which will not " 382 "which will not be allowed in the future\n",
383 "be allowed in the future\n", fsa->name); 383 fsa->name);
384 } 384 }
385 } 385 }
386 386
387#ifdef CONFIG_UNUSED_SYMBOLS 387#ifdef CONFIG_UNUSED_SYMBOLS
388 if (syms->unused && fsa->warn) { 388 if (syms->unused && fsa->warn) {
389 printk(KERN_WARNING "Symbol %s is marked as UNUSED, " 389 pr_warn("Symbol %s is marked as UNUSED, however this module is "
390 "however this module is using it.\n", fsa->name); 390 "using it.\n", fsa->name);
391 printk(KERN_WARNING 391 pr_warn("This symbol will go away in the future.\n");
392 "This symbol will go away in the future.\n"); 392 pr_warn("Please evalute if this is the right api to use and if "
393 printk(KERN_WARNING 393 "it really is, submit a report the linux kernel "
394 "Please evalute if this is the right api to use and if " 394 "mailinglist together with submitting your code for "
395 "it really is, submit a report the linux kernel " 395 "inclusion.\n");
396 "mailinglist together with submitting your code for "
397 "inclusion.\n");
398 } 396 }
399#endif 397#endif
400 398
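
Much of the module.c churn in this patch is mechanical: multi-line printk(KERN_WARNING ...) calls become pr_warn(), which in the kernel is a thin macro that folds the log level and an optional pr_fmt() prefix into the format string. The userspace sketch below only shows that macro shape; printf() stands in for printk(), so the "<4>" level marker is printed literally here instead of being interpreted by the log code.

#include <stdio.h>

/* Userspace stand-in for the kernel's KERN_WARNING level marker. */
#define KERN_WARNING "<4>"

/* pr_fmt() lets a file prepend its own prefix to every message; kernel
 * modules commonly define it as KBUILD_MODNAME ": " fmt. */
#ifndef pr_fmt
#define pr_fmt(fmt) "module: " fmt
#endif

/* Same shape as the kernel's pr_warn(): one call site, with the level and
 * prefix concatenated into the format string at compile time. */
#define pr_warn(fmt, ...) \
        printf(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
        pr_warn("Symbol %s is being used by a non-GPL module\n", "foo_sym");
        return 0;
}
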
@@ -492,16 +490,15 @@ static int percpu_modalloc(struct module *mod, struct load_info *info)
492 return 0; 490 return 0;
493 491
494 if (align > PAGE_SIZE) { 492 if (align > PAGE_SIZE) {
495 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 493 pr_warn("%s: per-cpu alignment %li > %li\n",
496 mod->name, align, PAGE_SIZE); 494 mod->name, align, PAGE_SIZE);
497 align = PAGE_SIZE; 495 align = PAGE_SIZE;
498 } 496 }
499 497
500 mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); 498 mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
501 if (!mod->percpu) { 499 if (!mod->percpu) {
502 printk(KERN_WARNING 500 pr_warn("%s: Could not allocate %lu bytes percpu data\n",
503 "%s: Could not allocate %lu bytes percpu data\n", 501 mod->name, (unsigned long)pcpusec->sh_size);
504 mod->name, (unsigned long)pcpusec->sh_size);
505 return -ENOMEM; 502 return -ENOMEM;
506 } 503 }
507 mod->percpu_size = pcpusec->sh_size; 504 mod->percpu_size = pcpusec->sh_size;
@@ -644,8 +641,6 @@ static int module_unload_init(struct module *mod)
644 641
645 /* Hold reference count during initialization. */ 642 /* Hold reference count during initialization. */
646 __this_cpu_write(mod->refptr->incs, 1); 643 __this_cpu_write(mod->refptr->incs, 1);
647 /* Backwards compatibility macros put refcount during init. */
648 mod->waiter = current;
649 644
650 return 0; 645 return 0;
651} 646}
@@ -679,7 +674,7 @@ static int add_module_usage(struct module *a, struct module *b)
679 pr_debug("Allocating new usage for %s.\n", a->name); 674 pr_debug("Allocating new usage for %s.\n", a->name);
680 use = kmalloc(sizeof(*use), GFP_ATOMIC); 675 use = kmalloc(sizeof(*use), GFP_ATOMIC);
681 if (!use) { 676 if (!use) {
682 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 677 pr_warn("%s: out of memory loading\n", a->name);
683 return -ENOMEM; 678 return -ENOMEM;
684 } 679 }
685 680
@@ -771,16 +766,9 @@ static int __try_stop_module(void *_sref)
771 766
772static int try_stop_module(struct module *mod, int flags, int *forced) 767static int try_stop_module(struct module *mod, int flags, int *forced)
773{ 768{
774 if (flags & O_NONBLOCK) { 769 struct stopref sref = { mod, flags, forced };
775 struct stopref sref = { mod, flags, forced };
776 770
777 return stop_machine(__try_stop_module, &sref, NULL); 771 return stop_machine(__try_stop_module, &sref, NULL);
778 } else {
779 /* We don't need to stop the machine for this. */
780 mod->state = MODULE_STATE_GOING;
781 synchronize_sched();
782 return 0;
783 }
784} 772}
785 773
786unsigned long module_refcount(struct module *mod) 774unsigned long module_refcount(struct module *mod)
@@ -813,21 +801,6 @@ EXPORT_SYMBOL(module_refcount);
813/* This exists whether we can unload or not */ 801/* This exists whether we can unload or not */
814static void free_module(struct module *mod); 802static void free_module(struct module *mod);
815 803
816static void wait_for_zero_refcount(struct module *mod)
817{
818 /* Since we might sleep for some time, release the mutex first */
819 mutex_unlock(&module_mutex);
820 for (;;) {
821 pr_debug("Looking at refcount...\n");
822 set_current_state(TASK_UNINTERRUPTIBLE);
823 if (module_refcount(mod) == 0)
824 break;
825 schedule();
826 }
827 current->state = TASK_RUNNING;
828 mutex_lock(&module_mutex);
829}
830
831SYSCALL_DEFINE2(delete_module, const char __user *, name_user, 804SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
832 unsigned int, flags) 805 unsigned int, flags)
833{ 806{
@@ -842,6 +815,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
842 return -EFAULT; 815 return -EFAULT;
843 name[MODULE_NAME_LEN-1] = '\0'; 816 name[MODULE_NAME_LEN-1] = '\0';
844 817
818 if (!(flags & O_NONBLOCK)) {
819 printk(KERN_WARNING
820 "waiting module removal not supported: please upgrade");
821 }
822
845 if (mutex_lock_interruptible(&module_mutex) != 0) 823 if (mutex_lock_interruptible(&module_mutex) != 0)
846 return -EINTR; 824 return -EINTR;
847 825
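
With the blocking path removed, delete_module(2) effectively always behaves as if O_NONBLOCK were set, and the new warning nags callers that still omit the flag. A hedged userspace example of the call as an rmmod-style tool would issue it; it needs CAP_SYS_ADMIN, and "example_mod" is only a placeholder module name.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        const char *name = argc > 1 ? argv[1] : "example_mod";

        /* O_NONBLOCK: fail (EWOULDBLOCK/EBUSY) instead of waiting for the
         * refcount to drop -- the only mode the kernel still honours after
         * the change above. */
        if (syscall(SYS_delete_module, name, O_NONBLOCK) != 0) {
                fprintf(stderr, "delete_module(%s): %s\n", name,
                        strerror(errno));
                return 1;
        }
        return 0;
}
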
@@ -859,8 +837,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
859 837
860 /* Doing init or already dying? */ 838 /* Doing init or already dying? */
861 if (mod->state != MODULE_STATE_LIVE) { 839 if (mod->state != MODULE_STATE_LIVE) {
862 /* FIXME: if (force), slam module count and wake up 840 /* FIXME: if (force), slam module count damn the torpedoes */
863 waiter --RR */
864 pr_debug("%s already dying\n", mod->name); 841 pr_debug("%s already dying\n", mod->name);
865 ret = -EBUSY; 842 ret = -EBUSY;
866 goto out; 843 goto out;
@@ -876,18 +853,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
876 } 853 }
877 } 854 }
878 855
879 /* Set this up before setting mod->state */
880 mod->waiter = current;
881
882 /* Stop the machine so refcounts can't move and disable module. */ 856 /* Stop the machine so refcounts can't move and disable module. */
883 ret = try_stop_module(mod, flags, &forced); 857 ret = try_stop_module(mod, flags, &forced);
884 if (ret != 0) 858 if (ret != 0)
885 goto out; 859 goto out;
886 860
887 /* Never wait if forced. */
888 if (!forced && module_refcount(mod) != 0)
889 wait_for_zero_refcount(mod);
890
891 mutex_unlock(&module_mutex); 861 mutex_unlock(&module_mutex);
892 /* Final destruction now no one is using it. */ 862 /* Final destruction now no one is using it. */
893 if (mod->exit != NULL) 863 if (mod->exit != NULL)
@@ -1005,9 +975,6 @@ void module_put(struct module *module)
1005 __this_cpu_inc(module->refptr->decs); 975 __this_cpu_inc(module->refptr->decs);
1006 976
1007 trace_module_put(module, _RET_IP_); 977 trace_module_put(module, _RET_IP_);
1008 /* Maybe they're waiting for us to drop reference? */
1009 if (unlikely(!module_is_live(module)))
1010 wake_up_process(module->waiter);
1011 preempt_enable(); 978 preempt_enable();
1012 } 979 }
1013} 980}
@@ -1145,8 +1112,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
1145{ 1112{
1146#ifdef CONFIG_MODULE_FORCE_LOAD 1113#ifdef CONFIG_MODULE_FORCE_LOAD
1147 if (!test_taint(TAINT_FORCED_MODULE)) 1114 if (!test_taint(TAINT_FORCED_MODULE))
1148 printk(KERN_WARNING "%s: %s: kernel tainted.\n", 1115 pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
1149 mod->name, reason);
1150 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); 1116 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
1151 return 0; 1117 return 0;
1152#else 1118#else
@@ -1199,8 +1165,7 @@ static int check_version(Elf_Shdr *sechdrs,
1199 goto bad_version; 1165 goto bad_version;
1200 } 1166 }
1201 1167
1202 printk(KERN_WARNING "%s: no symbol version for %s\n", 1168 pr_warn("%s: no symbol version for %s\n", mod->name, symname);
1203 mod->name, symname);
1204 return 0; 1169 return 0;
1205 1170
1206bad_version: 1171bad_version:
@@ -1309,8 +1274,8 @@ resolve_symbol_wait(struct module *mod,
1309 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) 1274 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
1310 || PTR_ERR(ksym) != -EBUSY, 1275 || PTR_ERR(ksym) != -EBUSY,
1311 30 * HZ) <= 0) { 1276 30 * HZ) <= 0) {
1312 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", 1277 pr_warn("%s: gave up waiting for init of module %s.\n",
1313 mod->name, owner); 1278 mod->name, owner);
1314 } 1279 }
1315 return ksym; 1280 return ksym;
1316} 1281}
@@ -1626,15 +1591,14 @@ static int mod_sysfs_init(struct module *mod)
1626 struct kobject *kobj; 1591 struct kobject *kobj;
1627 1592
1628 if (!module_sysfs_initialized) { 1593 if (!module_sysfs_initialized) {
1629 printk(KERN_ERR "%s: module sysfs not initialized\n", 1594 pr_err("%s: module sysfs not initialized\n", mod->name);
1630 mod->name);
1631 err = -EINVAL; 1595 err = -EINVAL;
1632 goto out; 1596 goto out;
1633 } 1597 }
1634 1598
1635 kobj = kset_find_obj(module_kset, mod->name); 1599 kobj = kset_find_obj(module_kset, mod->name);
1636 if (kobj) { 1600 if (kobj) {
1637 printk(KERN_ERR "%s: module is already loaded\n", mod->name); 1601 pr_err("%s: module is already loaded\n", mod->name);
1638 kobject_put(kobj); 1602 kobject_put(kobj);
1639 err = -EINVAL; 1603 err = -EINVAL;
1640 goto out; 1604 goto out;
@@ -1961,8 +1925,7 @@ static int verify_export_symbols(struct module *mod)
1961 for (i = 0; i < ARRAY_SIZE(arr); i++) { 1925 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1962 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { 1926 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1963 if (find_symbol(s->name, &owner, NULL, true, false)) { 1927 if (find_symbol(s->name, &owner, NULL, true, false)) {
1964 printk(KERN_ERR 1928 pr_err("%s: exports duplicate symbol %s"
1965 "%s: exports duplicate symbol %s"
1966 " (owned by %s)\n", 1929 " (owned by %s)\n",
1967 mod->name, s->name, module_name(owner)); 1930 mod->name, s->name, module_name(owner));
1968 return -ENOEXEC; 1931 return -ENOEXEC;
@@ -2013,8 +1976,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
2013 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1976 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
2014 break; 1977 break;
2015 1978
2016 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", 1979 pr_warn("%s: Unknown symbol %s (err %li)\n",
2017 mod->name, name, PTR_ERR(ksym)); 1980 mod->name, name, PTR_ERR(ksym));
2018 ret = PTR_ERR(ksym) ?: -ENOENT; 1981 ret = PTR_ERR(ksym) ?: -ENOENT;
2019 break; 1982 break;
2020 1983
@@ -2168,8 +2131,8 @@ static void set_license(struct module *mod, const char *license)
2168 2131
2169 if (!license_is_gpl_compatible(license)) { 2132 if (!license_is_gpl_compatible(license)) {
2170 if (!test_taint(TAINT_PROPRIETARY_MODULE)) 2133 if (!test_taint(TAINT_PROPRIETARY_MODULE))
2171 printk(KERN_WARNING "%s: module license '%s' taints " 2134 pr_warn("%s: module license '%s' taints kernel.\n",
2172 "kernel.\n", mod->name, license); 2135 mod->name, license);
2173 add_taint_module(mod, TAINT_PROPRIETARY_MODULE, 2136 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2174 LOCKDEP_NOW_UNRELIABLE); 2137 LOCKDEP_NOW_UNRELIABLE);
2175 } 2138 }
@@ -2405,8 +2368,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
2405 return; 2368 return;
2406#ifdef CONFIG_DYNAMIC_DEBUG 2369#ifdef CONFIG_DYNAMIC_DEBUG
2407 if (ddebug_add_module(debug, num, debug->modname)) 2370 if (ddebug_add_module(debug, num, debug->modname))
2408 printk(KERN_ERR "dynamic debug error adding module: %s\n", 2371 pr_err("dynamic debug error adding module: %s\n",
2409 debug->modname); 2372 debug->modname);
2410#endif 2373#endif
2411} 2374}
2412 2375
@@ -2619,8 +2582,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
2619 Elf_Shdr *shdr = &info->sechdrs[i]; 2582 Elf_Shdr *shdr = &info->sechdrs[i];
2620 if (shdr->sh_type != SHT_NOBITS 2583 if (shdr->sh_type != SHT_NOBITS
2621 && info->len < shdr->sh_offset + shdr->sh_size) { 2584 && info->len < shdr->sh_offset + shdr->sh_size) {
2622 printk(KERN_ERR "Module len %lu truncated\n", 2585 pr_err("Module len %lu truncated\n", info->len);
2623 info->len);
2624 return -ENOEXEC; 2586 return -ENOEXEC;
2625 } 2587 }
2626 2588
@@ -2682,15 +2644,14 @@ static struct module *setup_load_info(struct load_info *info, int flags)
2682 2644
2683 info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); 2645 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2684 if (!info->index.mod) { 2646 if (!info->index.mod) {
2685 printk(KERN_WARNING "No module found in object\n"); 2647 pr_warn("No module found in object\n");
2686 return ERR_PTR(-ENOEXEC); 2648 return ERR_PTR(-ENOEXEC);
2687 } 2649 }
2688 /* This is temporary: point mod into copy of data. */ 2650 /* This is temporary: point mod into copy of data. */
2689 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2651 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2690 2652
2691 if (info->index.sym == 0) { 2653 if (info->index.sym == 0) {
2692 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 2654 pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
2693 mod->name);
2694 return ERR_PTR(-ENOEXEC); 2655 return ERR_PTR(-ENOEXEC);
2695 } 2656 }
2696 2657
@@ -2717,7 +2678,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2717 if (err) 2678 if (err)
2718 return err; 2679 return err;
2719 } else if (!same_magic(modmagic, vermagic, info->index.vers)) { 2680 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2720 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 2681 pr_err("%s: version magic '%s' should be '%s'\n",
2721 mod->name, modmagic, vermagic); 2682 mod->name, modmagic, vermagic);
2722 return -ENOEXEC; 2683 return -ENOEXEC;
2723 } 2684 }
@@ -2727,9 +2688,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2727 2688
2728 if (get_modinfo(info, "staging")) { 2689 if (get_modinfo(info, "staging")) {
2729 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); 2690 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
2730 printk(KERN_WARNING "%s: module is from the staging directory," 2691 pr_warn("%s: module is from the staging directory, the quality "
2731 " the quality is unknown, you have been warned.\n", 2692 "is unknown, you have been warned.\n", mod->name);
2732 mod->name);
2733 } 2693 }
2734 2694
2735 /* Set up license info based on the info section */ 2695 /* Set up license info based on the info section */
@@ -2738,7 +2698,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2738 return 0; 2698 return 0;
2739} 2699}
2740 2700
2741static void find_module_sections(struct module *mod, struct load_info *info) 2701static int find_module_sections(struct module *mod, struct load_info *info)
2742{ 2702{
2743 mod->kp = section_objs(info, "__param", 2703 mod->kp = section_objs(info, "__param",
2744 sizeof(*mod->kp), &mod->num_kp); 2704 sizeof(*mod->kp), &mod->num_kp);
@@ -2768,6 +2728,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2768#ifdef CONFIG_CONSTRUCTORS 2728#ifdef CONFIG_CONSTRUCTORS
2769 mod->ctors = section_objs(info, ".ctors", 2729 mod->ctors = section_objs(info, ".ctors",
2770 sizeof(*mod->ctors), &mod->num_ctors); 2730 sizeof(*mod->ctors), &mod->num_ctors);
2731 if (!mod->ctors)
2732 mod->ctors = section_objs(info, ".init_array",
2733 sizeof(*mod->ctors), &mod->num_ctors);
2734 else if (find_sec(info, ".init_array")) {
2735 /*
2736 * This shouldn't happen with same compiler and binutils
2737 * building all parts of the module.
2738 */
2739 printk(KERN_WARNING "%s: has both .ctors and .init_array.\n",
2740 mod->name);
2741 return -EINVAL;
2742 }
2771#endif 2743#endif
2772 2744
2773#ifdef CONFIG_TRACEPOINTS 2745#ifdef CONFIG_TRACEPOINTS
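
The hunk above falls back to .init_array when a module has no .ctors section, since newer toolchains emit constructors there, and rejects a module carrying both as a mixed build. Either way the result at runtime is the same thing: an array of void (*)(void) that the kernel's do_mod_ctors() walks in order. The userspace sketch below shows only that table-walk shape with an invented ctors[] array; it is not the module loader code, and __attribute__((constructor)) is the userspace analogue that toolchains place in .init_array.

#include <stddef.h>
#include <stdio.h>

typedef void (*ctor_t)(void);

static void ctor_a(void) { puts("ctor_a"); }
static void ctor_b(void) { puts("ctor_b"); }

/* Stand-in for mod->ctors/mod->num_ctors: in a real module this table is
 * whatever the toolchain emitted into .ctors or .init_array. */
static const ctor_t ctors[] = { ctor_a, ctor_b };

static void run_ctors(void)
{
        /* Same loop shape as do_mod_ctors(): call each constructor in order. */
        for (size_t i = 0; i < sizeof(ctors) / sizeof(ctors[0]); i++)
                ctors[i]();
}

int main(void)
{
        run_ctors();
        return 0;
}
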
@@ -2801,11 +2773,12 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2801 sizeof(*mod->extable), &mod->num_exentries); 2773 sizeof(*mod->extable), &mod->num_exentries);
2802 2774
2803 if (section_addr(info, "__obsparm")) 2775 if (section_addr(info, "__obsparm"))
2804 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2776 pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
2805 mod->name);
2806 2777
2807 info->debug = section_objs(info, "__verbose", 2778 info->debug = section_objs(info, "__verbose",
2808 sizeof(*info->debug), &info->num_debug); 2779 sizeof(*info->debug), &info->num_debug);
2780
2781 return 0;
2809} 2782}
2810 2783
2811static int move_module(struct module *mod, struct load_info *info) 2784static int move_module(struct module *mod, struct load_info *info)
@@ -3078,11 +3051,10 @@ static int do_init_module(struct module *mod)
3078 return ret; 3051 return ret;
3079 } 3052 }
3080 if (ret > 0) { 3053 if (ret > 0) {
3081 printk(KERN_WARNING 3054 pr_warn("%s: '%s'->init suspiciously returned %d, it should "
3082"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" 3055 "follow 0/-E convention\n"
3083"%s: loading module anyway...\n", 3056 "%s: loading module anyway...\n",
3084 __func__, mod->name, ret, 3057 __func__, mod->name, ret, __func__);
3085 __func__);
3086 dump_stack(); 3058 dump_stack();
3087 } 3059 }
3088 3060
@@ -3205,10 +3177,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname)
3205{ 3177{
3206 /* Check for magic 'dyndbg' arg */ 3178 /* Check for magic 'dyndbg' arg */
3207 int ret = ddebug_dyndbg_module_param_cb(param, val, modname); 3179 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3208 if (ret != 0) { 3180 if (ret != 0)
3209 printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", 3181 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
3210 modname, param);
3211 }
3212 return 0; 3182 return 0;
3213} 3183}
3214 3184
@@ -3243,10 +3213,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3243#ifdef CONFIG_MODULE_SIG 3213#ifdef CONFIG_MODULE_SIG
3244 mod->sig_ok = info->sig_ok; 3214 mod->sig_ok = info->sig_ok;
3245 if (!mod->sig_ok) { 3215 if (!mod->sig_ok) {
3246 printk_once(KERN_NOTICE 3216 pr_notice_once("%s: module verification failed: signature "
3247 "%s: module verification failed: signature and/or" 3217 "and/or required key missing - tainting "
3248 " required key missing - tainting kernel\n", 3218 "kernel\n", mod->name);
3249 mod->name);
3250 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); 3219 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
3251 } 3220 }
3252#endif 3221#endif
@@ -3263,7 +3232,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3263 3232
3264 /* Now we've got everything in the final locations, we can 3233 /* Now we've got everything in the final locations, we can
3265 * find optional sections. */ 3234 * find optional sections. */
3266 find_module_sections(mod, info); 3235 err = find_module_sections(mod, info);
3236 if (err)
3237 goto free_unload;
3267 3238
3268 err = check_module_license_and_versions(mod); 3239 err = check_module_license_and_versions(mod);
3269 if (err) 3240 if (err)
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index f2970bddc5ea..be5b8fac4bd0 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -14,6 +14,7 @@
14#include <crypto/public_key.h> 14#include <crypto/public_key.h>
15#include <crypto/hash.h> 15#include <crypto/hash.h>
16#include <keys/asymmetric-type.h> 16#include <keys/asymmetric-type.h>
17#include <keys/system_keyring.h>
17#include "module-internal.h" 18#include "module-internal.h"
18 19
19/* 20/*
@@ -28,7 +29,7 @@
28 */ 29 */
29struct module_signature { 30struct module_signature {
30 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ 31 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
31 u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ 32 u8 hash; /* Digest algorithm [enum hash_algo] */
32 u8 id_type; /* Key identifier type [enum pkey_id_type] */ 33 u8 id_type; /* Key identifier type [enum pkey_id_type] */
33 u8 signer_len; /* Length of signer's name */ 34 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */ 35 u8 key_id_len; /* Length of key identifier */
@@ -39,7 +40,7 @@ struct module_signature {
39/* 40/*
40 * Digest the module contents. 41 * Digest the module contents.
41 */ 42 */
42static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, 43static struct public_key_signature *mod_make_digest(enum hash_algo hash,
43 const void *mod, 44 const void *mod,
44 unsigned long modlen) 45 unsigned long modlen)
45{ 46{
@@ -54,7 +55,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
54 /* Allocate the hashing algorithm we're going to need and find out how 55 /* Allocate the hashing algorithm we're going to need and find out how
55 * big the hash operational data will be. 56 * big the hash operational data will be.
56 */ 57 */
57 tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); 58 tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
58 if (IS_ERR(tfm)) 59 if (IS_ERR(tfm))
59 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); 60 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
60 61
@@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
157 158
158 pr_debug("Look up: \"%s\"\n", id); 159 pr_debug("Look up: \"%s\"\n", id);
159 160
160 key = keyring_search(make_key_ref(modsign_keyring, 1), 161 key = keyring_search(make_key_ref(system_trusted_keyring, 1),
161 &key_type_asymmetric, id); 162 &key_type_asymmetric, id);
162 if (IS_ERR(key)) 163 if (IS_ERR(key))
163 pr_warn("Request for unknown module key '%s' err %ld\n", 164 pr_warn("Request for unknown module key '%s' err %ld\n",
@@ -217,7 +218,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
217 return -ENOPKG; 218 return -ENOPKG;
218 219
219 if (ms.hash >= PKEY_HASH__LAST || 220 if (ms.hash >= PKEY_HASH__LAST ||
220 !pkey_hash_algo[ms.hash]) 221 !hash_algo_name[ms.hash])
221 return -ENOPKG; 222 return -ENOPKG;
222 223
223 key = request_asymmetric_key(sig, ms.signer_len, 224 key = request_asymmetric_key(sig, ms.signer_len,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 997cbb951a3b..8e7811086b82 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -126,22 +126,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
126 struct nsproxy *old_ns = tsk->nsproxy; 126 struct nsproxy *old_ns = tsk->nsproxy;
127 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); 127 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
128 struct nsproxy *new_ns; 128 struct nsproxy *new_ns;
129 int err = 0;
130 129
131 if (!old_ns) 130 if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
131 CLONE_NEWPID | CLONE_NEWNET)))) {
132 get_nsproxy(old_ns);
132 return 0; 133 return 0;
133
134 get_nsproxy(old_ns);
135
136 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
137 CLONE_NEWPID | CLONE_NEWNET)))
138 return 0;
139
140 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
141 err = -EPERM;
142 goto out;
143 } 134 }
144 135
136 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
137 return -EPERM;
138
145 /* 139 /*
146 * CLONE_NEWIPC must detach from the undolist: after switching 140 * CLONE_NEWIPC must detach from the undolist: after switching
147 * to a new ipc namespace, the semaphore arrays from the old 141 * to a new ipc namespace, the semaphore arrays from the old
@@ -149,22 +143,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
149 * means share undolist with parent, so we must forbid using 143 * means share undolist with parent, so we must forbid using
150 * it along with CLONE_NEWIPC. 144 * it along with CLONE_NEWIPC.
151 */ 145 */
152 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { 146 if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
153 err = -EINVAL; 147 (CLONE_NEWIPC | CLONE_SYSVSEM))
154 goto out; 148 return -EINVAL;
155 }
156 149
157 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); 150 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
158 if (IS_ERR(new_ns)) { 151 if (IS_ERR(new_ns))
159 err = PTR_ERR(new_ns); 152 return PTR_ERR(new_ns);
160 goto out;
161 }
162 153
163 tsk->nsproxy = new_ns; 154 tsk->nsproxy = new_ns;
164 155 return 0;
165out:
166 put_nsproxy(old_ns);
167 return err;
168} 156}
169 157
170void free_nsproxy(struct nsproxy *ns) 158void free_nsproxy(struct nsproxy *ns)
diff --git a/kernel/panic.c b/kernel/panic.c
index 801864600514..c00b4ceb39e8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -123,10 +123,14 @@ void panic(const char *fmt, ...)
123 */ 123 */
124 smp_send_stop(); 124 smp_send_stop();
125 125
126 kmsg_dump(KMSG_DUMP_PANIC); 126 /*
127 127 * Run any panic handlers, including those that might need to
128 * add information to the kmsg dump output.
129 */
128 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 130 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
129 131
132 kmsg_dump(KMSG_DUMP_PANIC);
133
130 bust_spinlocks(0); 134 bust_spinlocks(0);
131 135
132 if (!panic_blink) 136 if (!panic_blink)
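
The reordering above runs the panic notifier chain before kmsg_dump(), so whatever the handlers print still lands in the dump. The snippet below is an untested sketch of a module hooking that chain; atomic_notifier_chain_register() and panic_notifier_list are real kernel interfaces, the void *data argument is the buf passed in the call above, and everything else (names, message text) is illustrative only.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

/* Runs on the panic notifier chain; after the reordering this executes
 * before kmsg_dump(), so anything printed here ends up in the dump. */
static int my_panic_event(struct notifier_block *nb, unsigned long action,
                          void *data)
{
        pr_emerg("example panic handler saw: %s\n", (const char *)data);
        return NOTIFY_DONE;
}

static struct notifier_block my_panic_nb = {
        .notifier_call = my_panic_event,
};

static int __init my_panic_init(void)
{
        atomic_notifier_chain_register(&panic_notifier_list, &my_panic_nb);
        return 0;
}

static void __exit my_panic_exit(void)
{
        atomic_notifier_chain_unregister(&panic_notifier_list, &my_panic_nb);
}

module_init(my_panic_init);
module_exit(my_panic_exit);
MODULE_LICENSE("GPL");
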
@@ -229,7 +233,7 @@ static const struct tnt tnts[] = {
229 */ 233 */
230const char *print_tainted(void) 234const char *print_tainted(void)
231{ 235{
232 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; 236 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];
233 237
234 if (tainted_mask) { 238 if (tainted_mask) {
235 char *s; 239 char *s;
diff --git a/kernel/params.c b/kernel/params.c
index 501bde4f3bee..c00d5b502aa4 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -253,13 +253,13 @@ int parse_args(const char *doing,
253 EXPORT_SYMBOL(param_ops_##name) 253 EXPORT_SYMBOL(param_ops_##name)
254 254
255 255
256STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul); 256STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul);
257STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); 257STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol);
258STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); 258STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul);
259STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); 259STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol);
260STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); 260STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul);
261STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); 261STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol);
262STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); 262STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul);
263 263
264int param_set_charp(const char *val, const struct kernel_param *kp) 264int param_set_charp(const char *val, const struct kernel_param *kp)
265{ 265{
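
The STANDARD_PARAM_DEF lines switch from the deprecated strict_strtoul()/strict_strtol() helpers to kstrtoul()/kstrtol(), which parse the whole string, tolerate a single trailing newline and reject any other trailing junk. Below is a rough userspace approximation of kstrtoul() built on strtoul(), shown only to make that stricter contract concrete; the real helpers live in lib/kstrtox.c and differ in detail.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Userspace approximation of kstrtoul(): whole-string parse, one optional
 * trailing newline allowed, anything else after the digits is an error. */
static int my_kstrtoul(const char *s, unsigned int base, unsigned long *res)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(s, &end, base);
        if (errno == ERANGE)
                return -ERANGE;
        if (end == s)
                return -EINVAL;         /* no digits at all */
        if (*end == '\n')
                end++;
        if (*end != '\0')
                return -EINVAL;         /* trailing junk */
        *res = val;
        return 0;
}

int main(void)
{
        unsigned long v = 0;
        int ret;

        ret = my_kstrtoul("42\n", 10, &v);
        printf("\"42\\n\" -> ret=%d val=%lu\n", ret, v);

        ret = my_kstrtoul("42abc", 10, &v);
        printf("\"42abc\" -> ret=%d\n", ret);
        return 0;
}
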
diff --git a/kernel/pid.c b/kernel/pid.c
index 66505c1dfc51..9b9a26698144 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -265,6 +265,7 @@ void free_pid(struct pid *pid)
265 struct pid_namespace *ns = upid->ns; 265 struct pid_namespace *ns = upid->ns;
266 hlist_del_rcu(&upid->pid_chain); 266 hlist_del_rcu(&upid->pid_chain);
267 switch(--ns->nr_hashed) { 267 switch(--ns->nr_hashed) {
268 case 2:
268 case 1: 269 case 1:
269 /* When all that is left in the pid namespace 270 /* When all that is left in the pid namespace
270 * is the reaper wake up the reaper. The reaper 271 * is the reaper wake up the reaper. The reaper
@@ -272,6 +273,11 @@ void free_pid(struct pid *pid)
272 */ 273 */
273 wake_up_process(ns->child_reaper); 274 wake_up_process(ns->child_reaper);
274 break; 275 break;
276 case PIDNS_HASH_ADDING:
277 /* Handle a fork failure of the first process */
278 WARN_ON(ns->child_reaper);
279 ns->nr_hashed = 0;
280 /* fall through */
275 case 0: 281 case 0:
276 schedule_work(&ns->proc_work); 282 schedule_work(&ns->proc_work);
277 break; 283 break;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 601bb361c235..06c62de9c711 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -132,6 +132,12 @@ out:
132 return ERR_PTR(err); 132 return ERR_PTR(err);
133} 133}
134 134
135static void delayed_free_pidns(struct rcu_head *p)
136{
137 kmem_cache_free(pid_ns_cachep,
138 container_of(p, struct pid_namespace, rcu));
139}
140
135static void destroy_pid_namespace(struct pid_namespace *ns) 141static void destroy_pid_namespace(struct pid_namespace *ns)
136{ 142{
137 int i; 143 int i;
@@ -140,7 +146,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
140 for (i = 0; i < PIDMAP_ENTRIES; i++) 146 for (i = 0; i < PIDMAP_ENTRIES; i++)
141 kfree(ns->pidmap[i].page); 147 kfree(ns->pidmap[i].page);
142 put_user_ns(ns->user_ns); 148 put_user_ns(ns->user_ns);
143 kmem_cache_free(pid_ns_cachep, ns); 149 call_rcu(&ns->rcu, delayed_free_pidns);
144} 150}
145 151
146struct pid_namespace *copy_pid_ns(unsigned long flags, 152struct pid_namespace *copy_pid_ns(unsigned long flags,
@@ -329,7 +335,7 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
329 struct pid_namespace *ancestor, *new = ns; 335 struct pid_namespace *ancestor, *new = ns;
330 336
331 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || 337 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
332 !nsown_capable(CAP_SYS_ADMIN)) 338 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
333 return -EPERM; 339 return -EPERM;
334 340
335 /* 341 /*
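
The delayed_free_pidns() hunk above defers the kmem_cache_free() through call_rcu(), so a reader that looked the namespace up under rcu_read_lock() cannot have it freed under its feet; the callback receives only the embedded rcu_head and uses container_of() to get back to the pid_namespace. The userspace sketch below reproduces just that container_of() step with invented fake_* names and an immediate "grace period", so it models the pointer arithmetic, not RCU itself.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_rcu_head {
        void (*func)(struct fake_rcu_head *);
};

struct fake_pidns {
        int level;
        struct fake_rcu_head rcu;       /* embedded, like pid_namespace->rcu */
};

/* Same shape as delayed_free_pidns(): the callback only sees the rcu_head
 * and recovers the enclosing object in order to free it. */
static void delayed_free(struct fake_rcu_head *p)
{
        struct fake_pidns *ns = container_of(p, struct fake_pidns, rcu);

        printf("freeing pidns at level %d\n", ns->level);
        free(ns);
}

/* Stand-in for call_rcu(): a real grace period would elapse between the
 * call and the callback; here the callback simply runs right away. */
static void fake_call_rcu(struct fake_rcu_head *head,
                          void (*func)(struct fake_rcu_head *))
{
        head->func = func;
        head->func(head);
}

int main(void)
{
        struct fake_pidns *ns = malloc(sizeof(*ns));

        ns->level = 1;
        fake_call_rcu(&ns->rcu, delayed_free);
        return 0;
}
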
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index d444c4e834f4..2fac9cc79b3d 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -178,6 +178,22 @@ config PM_SLEEP_DEBUG
178 def_bool y 178 def_bool y
179 depends on PM_DEBUG && PM_SLEEP 179 depends on PM_DEBUG && PM_SLEEP
180 180
181config DPM_WATCHDOG
182 bool "Device suspend/resume watchdog"
183 depends on PM_DEBUG && PSTORE
184 ---help---
185 Sets up a watchdog timer to capture drivers that are
186 locked up attempting to suspend/resume a device.
187 A detected lockup causes system panic with message
188 captured in pstore device for inspection in subsequent
189 boot session.
190
191config DPM_WATCHDOG_TIMEOUT
192 int "Watchdog timeout in seconds"
193 range 1 120
194 default 12
195 depends on DPM_WATCHDOG
196
181config PM_TRACE 197config PM_TRACE
182 bool 198 bool
183 help 199 help
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 3085e62a80a5..0121dab83f43 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -644,22 +644,23 @@ int hibernate(void)
644 if (error) 644 if (error)
645 goto Exit; 645 goto Exit;
646 646
647 /* Allocate memory management structures */
648 error = create_basic_memory_bitmaps();
649 if (error)
650 goto Exit;
651
652 printk(KERN_INFO "PM: Syncing filesystems ... "); 647 printk(KERN_INFO "PM: Syncing filesystems ... ");
653 sys_sync(); 648 sys_sync();
654 printk("done.\n"); 649 printk("done.\n");
655 650
656 error = freeze_processes(); 651 error = freeze_processes();
657 if (error) 652 if (error)
658 goto Free_bitmaps; 653 goto Exit;
654
655 lock_device_hotplug();
656 /* Allocate memory management structures */
657 error = create_basic_memory_bitmaps();
658 if (error)
659 goto Thaw;
659 660
660 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 661 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
661 if (error || freezer_test_done) 662 if (error || freezer_test_done)
662 goto Thaw; 663 goto Free_bitmaps;
663 664
664 if (in_suspend) { 665 if (in_suspend) {
665 unsigned int flags = 0; 666 unsigned int flags = 0;
@@ -682,14 +683,14 @@ int hibernate(void)
682 pr_debug("PM: Image restored successfully.\n"); 683 pr_debug("PM: Image restored successfully.\n");
683 } 684 }
684 685
686 Free_bitmaps:
687 free_basic_memory_bitmaps();
685 Thaw: 688 Thaw:
689 unlock_device_hotplug();
686 thaw_processes(); 690 thaw_processes();
687 691
688 /* Don't bother checking whether freezer_test_done is true */ 692 /* Don't bother checking whether freezer_test_done is true */
689 freezer_test_done = false; 693 freezer_test_done = false;
690
691 Free_bitmaps:
692 free_basic_memory_bitmaps();
693 Exit: 694 Exit:
694 pm_notifier_call_chain(PM_POST_HIBERNATION); 695 pm_notifier_call_chain(PM_POST_HIBERNATION);
695 pm_restore_console(); 696 pm_restore_console();
@@ -806,21 +807,20 @@ static int software_resume(void)
806 pm_prepare_console(); 807 pm_prepare_console();
807 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 808 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
808 if (error) 809 if (error)
809 goto close_finish; 810 goto Close_Finish;
810
811 error = create_basic_memory_bitmaps();
812 if (error)
813 goto close_finish;
814 811
815 pr_debug("PM: Preparing processes for restore.\n"); 812 pr_debug("PM: Preparing processes for restore.\n");
816 error = freeze_processes(); 813 error = freeze_processes();
817 if (error) { 814 if (error)
818 swsusp_close(FMODE_READ); 815 goto Close_Finish;
819 goto Done;
820 }
821 816
822 pr_debug("PM: Loading hibernation image.\n"); 817 pr_debug("PM: Loading hibernation image.\n");
823 818
819 lock_device_hotplug();
820 error = create_basic_memory_bitmaps();
821 if (error)
822 goto Thaw;
823
824 error = swsusp_read(&flags); 824 error = swsusp_read(&flags);
825 swsusp_close(FMODE_READ); 825 swsusp_close(FMODE_READ);
826 if (!error) 826 if (!error)
@@ -828,9 +828,10 @@ static int software_resume(void)
828 828
829 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); 829 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
830 swsusp_free(); 830 swsusp_free();
831 thaw_processes();
832 Done:
833 free_basic_memory_bitmaps(); 831 free_basic_memory_bitmaps();
832 Thaw:
833 unlock_device_hotplug();
834 thaw_processes();
834 Finish: 835 Finish:
835 pm_notifier_call_chain(PM_POST_RESTORE); 836 pm_notifier_call_chain(PM_POST_RESTORE);
836 pm_restore_console(); 837 pm_restore_console();
@@ -840,12 +841,12 @@ static int software_resume(void)
840 mutex_unlock(&pm_mutex); 841 mutex_unlock(&pm_mutex);
841 pr_debug("PM: Hibernation image not present or could not be loaded.\n"); 842 pr_debug("PM: Hibernation image not present or could not be loaded.\n");
842 return error; 843 return error;
843close_finish: 844 Close_Finish:
844 swsusp_close(FMODE_READ); 845 swsusp_close(FMODE_READ);
845 goto Finish; 846 goto Finish;
846} 847}
847 848
848late_initcall(software_resume); 849late_initcall_sync(software_resume);
849 850
850 851
851static const char * const hibernation_modes[] = { 852static const char * const hibernation_modes[] = {
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index a394297f8b2f..8dff9b48075a 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -558,30 +558,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
558 if (count == sizeof(s32)) { 558 if (count == sizeof(s32)) {
559 if (copy_from_user(&value, buf, sizeof(s32))) 559 if (copy_from_user(&value, buf, sizeof(s32)))
560 return -EFAULT; 560 return -EFAULT;
561 } else if (count <= 11) { /* ASCII perhaps? */ 561 } else {
562 char ascii_value[11];
563 unsigned long int ulval;
564 int ret; 562 int ret;
565 563
566 if (copy_from_user(ascii_value, buf, count)) 564 ret = kstrtos32_from_user(buf, count, 16, &value);
567 return -EFAULT; 565 if (ret)
568 566 return ret;
569 if (count > 10) {
570 if (ascii_value[10] == '\n')
571 ascii_value[10] = '\0';
572 else
573 return -EINVAL;
574 } else {
575 ascii_value[count] = '\0';
576 }
577 ret = kstrtoul(ascii_value, 16, &ulval);
578 if (ret) {
579 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
580 return -EINVAL;
581 }
582 value = (s32)lower_32_bits(ulval);
583 } else {
584 return -EINVAL;
585 } 567 }
586 568
587 req = filp->private_data; 569 req = filp->private_data;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 349587bb03e1..b38109e204af 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
352 struct mem_extent *ext, *cur, *aux; 352 struct mem_extent *ext, *cur, *aux;
353 353
354 zone_start = zone->zone_start_pfn; 354 zone_start = zone->zone_start_pfn;
355 zone_end = zone->zone_start_pfn + zone->spanned_pages; 355 zone_end = zone_end_pfn(zone);
356 356
357 list_for_each_entry(ext, list, hook) 357 list_for_each_entry(ext, list, hook)
358 if (zone_start <= ext->end) 358 if (zone_start <= ext->end)
@@ -743,7 +743,10 @@ int create_basic_memory_bitmaps(void)
743 struct memory_bitmap *bm1, *bm2; 743 struct memory_bitmap *bm1, *bm2;
744 int error = 0; 744 int error = 0;
745 745
746 BUG_ON(forbidden_pages_map || free_pages_map); 746 if (forbidden_pages_map && free_pages_map)
747 return 0;
748 else
749 BUG_ON(forbidden_pages_map || free_pages_map);
747 750
748 bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); 751 bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
749 if (!bm1) 752 if (!bm1)
@@ -789,7 +792,8 @@ void free_basic_memory_bitmaps(void)
789{ 792{
790 struct memory_bitmap *bm1, *bm2; 793 struct memory_bitmap *bm1, *bm2;
791 794
792 BUG_ON(!(forbidden_pages_map && free_pages_map)); 795 if (WARN_ON(!(forbidden_pages_map && free_pages_map)))
796 return;
793 797
794 bm1 = forbidden_pages_map; 798 bm1 = forbidden_pages_map;
795 bm2 = free_pages_map; 799 bm2 = free_pages_map;
@@ -884,7 +888,7 @@ static unsigned int count_highmem_pages(void)
884 continue; 888 continue;
885 889
886 mark_free_pages(zone); 890 mark_free_pages(zone);
887 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 891 max_zone_pfn = zone_end_pfn(zone);
888 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 892 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
889 if (saveable_highmem_page(zone, pfn)) 893 if (saveable_highmem_page(zone, pfn))
890 n++; 894 n++;
@@ -948,7 +952,7 @@ static unsigned int count_data_pages(void)
948 continue; 952 continue;
949 953
950 mark_free_pages(zone); 954 mark_free_pages(zone);
951 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 955 max_zone_pfn = zone_end_pfn(zone);
952 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 956 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
953 if (saveable_page(zone, pfn)) 957 if (saveable_page(zone, pfn))
954 n++; 958 n++;
@@ -1041,7 +1045,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1041 unsigned long max_zone_pfn; 1045 unsigned long max_zone_pfn;
1042 1046
1043 mark_free_pages(zone); 1047 mark_free_pages(zone);
1044 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1048 max_zone_pfn = zone_end_pfn(zone);
1045 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1049 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1046 if (page_is_saveable(zone, pfn)) 1050 if (page_is_saveable(zone, pfn))
1047 memory_bm_set_bit(orig_bm, pfn); 1051 memory_bm_set_bit(orig_bm, pfn);
@@ -1093,7 +1097,7 @@ void swsusp_free(void)
1093 unsigned long pfn, max_zone_pfn; 1097 unsigned long pfn, max_zone_pfn;
1094 1098
1095 for_each_populated_zone(zone) { 1099 for_each_populated_zone(zone) {
1096 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1100 max_zone_pfn = zone_end_pfn(zone);
1097 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1101 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1098 if (pfn_valid(pfn)) { 1102 if (pfn_valid(pfn)) {
1099 struct page *page = pfn_to_page(pfn); 1103 struct page *page = pfn_to_page(pfn);
@@ -1399,7 +1403,11 @@ int hibernate_preallocate_memory(void)
1399 * highmem and non-highmem zones separately. 1403 * highmem and non-highmem zones separately.
1400 */ 1404 */
1401 pages_highmem = preallocate_image_highmem(highmem / 2); 1405 pages_highmem = preallocate_image_highmem(highmem / 2);
1402 alloc = (count - max_size) - pages_highmem; 1406 alloc = count - max_size;
1407 if (alloc > pages_highmem)
1408 alloc -= pages_highmem;
1409 else
1410 alloc = 0;
1403 pages = preallocate_image_memory(alloc, avail_normal); 1411 pages = preallocate_image_memory(alloc, avail_normal);
1404 if (pages < alloc) { 1412 if (pages < alloc) {
1405 /* We have exhausted non-highmem pages, try highmem. */ 1413 /* We have exhausted non-highmem pages, try highmem. */
@@ -1755,7 +1763,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1755 1763
1756 /* Clear page flags */ 1764 /* Clear page flags */
1757 for_each_populated_zone(zone) { 1765 for_each_populated_zone(zone) {
1758 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1766 max_zone_pfn = zone_end_pfn(zone);
1759 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1767 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1760 if (pfn_valid(pfn)) 1768 if (pfn_valid(pfn))
1761 swsusp_unset_page_free(pfn_to_page(pfn)); 1769 swsusp_unset_page_free(pfn_to_page(pfn));
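
All of the snapshot.c hunks replace the open-coded zone->zone_start_pfn + zone->spanned_pages with the zone_end_pfn() helper, so the pfn loops are unchanged in behaviour. The sketch below restates the helper over a minimal stand-in struct; it is an approximation of the include/linux/mmzone.h inline for illustration, not a verbatim copy.

#include <stdio.h>

/* Minimal stand-in for the struct zone fields that matter here. */
struct zone {
        unsigned long zone_start_pfn;
        unsigned long spanned_pages;
};

/* Approximation of the helper the hunks above switch to: the end pfn is
 * simply the start pfn plus the spanned page count. */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
        return zone->zone_start_pfn + zone->spanned_pages;
}

int main(void)
{
        struct zone z = { .zone_start_pfn = 0x1000, .spanned_pages = 256 };
        unsigned long pfn, max_zone_pfn = zone_end_pfn(&z);
        unsigned long count = 0;

        /* Same loop shape as count_data_pages()/swsusp_free() above. */
        for (pfn = z.zone_start_pfn; pfn < max_zone_pfn; pfn++)
                count++;

        printf("zone spans pfns [0x%lx, 0x%lx), %lu pages\n",
               z.zone_start_pfn, max_zone_pfn, count);
        return 0;
}
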
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4ed81e74f86f..98d357584cd6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -36,9 +36,10 @@ static struct snapshot_data {
36 struct snapshot_handle handle; 36 struct snapshot_handle handle;
37 int swap; 37 int swap;
38 int mode; 38 int mode;
39 char frozen; 39 bool frozen;
40 char ready; 40 bool ready;
41 char platform_support; 41 bool platform_support;
42 bool free_bitmaps;
42} snapshot_state; 43} snapshot_state;
43 44
44atomic_t snapshot_device_available = ATOMIC_INIT(1); 45atomic_t snapshot_device_available = ATOMIC_INIT(1);
@@ -60,11 +61,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
60 error = -ENOSYS; 61 error = -ENOSYS;
61 goto Unlock; 62 goto Unlock;
62 } 63 }
63 if(create_basic_memory_bitmaps()) {
64 atomic_inc(&snapshot_device_available);
65 error = -ENOMEM;
66 goto Unlock;
67 }
68 nonseekable_open(inode, filp); 64 nonseekable_open(inode, filp);
69 data = &snapshot_state; 65 data = &snapshot_state;
70 filp->private_data = data; 66 filp->private_data = data;
@@ -74,6 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
74 data->swap = swsusp_resume_device ? 70 data->swap = swsusp_resume_device ?
75 swap_type_of(swsusp_resume_device, 0, NULL) : -1; 71 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
76 data->mode = O_RDONLY; 72 data->mode = O_RDONLY;
73 data->free_bitmaps = false;
77 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 74 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
78 if (error) 75 if (error)
79 pm_notifier_call_chain(PM_POST_HIBERNATION); 76 pm_notifier_call_chain(PM_POST_HIBERNATION);
@@ -87,16 +84,19 @@ static int snapshot_open(struct inode *inode, struct file *filp)
87 data->swap = -1; 84 data->swap = -1;
88 data->mode = O_WRONLY; 85 data->mode = O_WRONLY;
89 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 86 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
87 if (!error) {
88 error = create_basic_memory_bitmaps();
89 data->free_bitmaps = !error;
90 }
90 if (error) 91 if (error)
91 pm_notifier_call_chain(PM_POST_RESTORE); 92 pm_notifier_call_chain(PM_POST_RESTORE);
92 } 93 }
93 if (error) { 94 if (error)
94 free_basic_memory_bitmaps();
95 atomic_inc(&snapshot_device_available); 95 atomic_inc(&snapshot_device_available);
96 } 96
97 data->frozen = 0; 97 data->frozen = false;
98 data->ready = 0; 98 data->ready = false;
99 data->platform_support = 0; 99 data->platform_support = false;
100 100
101 Unlock: 101 Unlock:
102 unlock_system_sleep(); 102 unlock_system_sleep();
@@ -111,12 +111,14 @@ static int snapshot_release(struct inode *inode, struct file *filp)
111 lock_system_sleep(); 111 lock_system_sleep();
112 112
113 swsusp_free(); 113 swsusp_free();
114 free_basic_memory_bitmaps();
115 data = filp->private_data; 114 data = filp->private_data;
116 free_all_swap_pages(data->swap); 115 free_all_swap_pages(data->swap);
117 if (data->frozen) { 116 if (data->frozen) {
118 pm_restore_gfp_mask(); 117 pm_restore_gfp_mask();
118 free_basic_memory_bitmaps();
119 thaw_processes(); 119 thaw_processes();
120 } else if (data->free_bitmaps) {
121 free_basic_memory_bitmaps();
120 } 122 }
121 pm_notifier_call_chain(data->mode == O_RDONLY ? 123 pm_notifier_call_chain(data->mode == O_RDONLY ?
122 PM_POST_HIBERNATION : PM_POST_RESTORE); 124 PM_POST_HIBERNATION : PM_POST_RESTORE);
@@ -207,6 +209,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
207 if (!mutex_trylock(&pm_mutex)) 209 if (!mutex_trylock(&pm_mutex))
208 return -EBUSY; 210 return -EBUSY;
209 211
212 lock_device_hotplug();
210 data = filp->private_data; 213 data = filp->private_data;
211 214
212 switch (cmd) { 215 switch (cmd) {
@@ -220,16 +223,25 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
220 printk("done.\n"); 223 printk("done.\n");
221 224
222 error = freeze_processes(); 225 error = freeze_processes();
223 if (!error) 226 if (error)
224 data->frozen = 1; 227 break;
228
229 error = create_basic_memory_bitmaps();
230 if (error)
231 thaw_processes();
232 else
233 data->frozen = true;
234
225 break; 235 break;
226 236
227 case SNAPSHOT_UNFREEZE: 237 case SNAPSHOT_UNFREEZE:
228 if (!data->frozen || data->ready) 238 if (!data->frozen || data->ready)
229 break; 239 break;
230 pm_restore_gfp_mask(); 240 pm_restore_gfp_mask();
241 free_basic_memory_bitmaps();
242 data->free_bitmaps = false;
231 thaw_processes(); 243 thaw_processes();
232 data->frozen = 0; 244 data->frozen = false;
233 break; 245 break;
234 246
235 case SNAPSHOT_CREATE_IMAGE: 247 case SNAPSHOT_CREATE_IMAGE:
@@ -259,7 +271,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
259 case SNAPSHOT_FREE: 271 case SNAPSHOT_FREE:
260 swsusp_free(); 272 swsusp_free();
261 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 273 memset(&data->handle, 0, sizeof(struct snapshot_handle));
262 data->ready = 0; 274 data->ready = false;
263 /* 275 /*
264 * It is necessary to thaw kernel threads here, because 276 * It is necessary to thaw kernel threads here, because
265 * SNAPSHOT_CREATE_IMAGE may be invoked directly after 277 * SNAPSHOT_CREATE_IMAGE may be invoked directly after
@@ -323,7 +335,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
323 * PM_HIBERNATION_PREPARE 335 * PM_HIBERNATION_PREPARE
324 */ 336 */
325 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 337 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
326 data->ready = 0; 338 data->ready = false;
327 break; 339 break;
328 340
329 case SNAPSHOT_PLATFORM_SUPPORT: 341 case SNAPSHOT_PLATFORM_SUPPORT:
@@ -371,6 +383,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
371 383
372 } 384 }
373 385
386 unlock_device_hotplug();
374 mutex_unlock(&pm_mutex); 387 mutex_unlock(&pm_mutex);
375 388
376 return error; 389 return error;
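
The reordered SNAPSHOT_FREEZE path above freezes processes first, creates the memory bitmaps second, and thaws again if the allocation fails, while snapshot_release() frees the bitmaps either on the frozen path or via the new free_bitmaps flag. A minimal userspace sketch of that acquire-in-order / undo-on-failure shape follows; the stub functions are hypothetical stand-ins for the kernel helpers, not their real implementations.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel helpers used by the snapshot device. */
static int freeze_processes(void) { puts("processes frozen"); return 0; }
static void thaw_processes(void) { puts("processes thawed"); }
static int create_basic_memory_bitmaps(void) { return 0; /* or -1 for -ENOMEM */ }
static void free_basic_memory_bitmaps(void) { puts("bitmaps freed"); }

struct snapshot_data { bool frozen; bool free_bitmaps; };

/* Acquire in order; if the second step fails, undo the first. */
static int snapshot_freeze(struct snapshot_data *data)
{
	int error = freeze_processes();
	if (error)
		return error;

	error = create_basic_memory_bitmaps();
	if (error)
		thaw_processes();	/* roll back step one */
	else
		data->frozen = true;	/* both steps succeeded */
	return error;
}

/* Release only what was actually acquired, mirroring snapshot_release(). */
static void snapshot_cleanup(struct snapshot_data *data)
{
	if (data->frozen) {
		free_basic_memory_bitmaps();
		thaw_processes();
	} else if (data->free_bitmaps) {
		free_basic_memory_bitmaps();
	}
}

int main(void)
{
	struct snapshot_data data = { false, false };

	if (!snapshot_freeze(&data))
		snapshot_cleanup(&data);
	return 0;
}

The free_bitmaps flag exists because release must free only what open or an earlier ioctl actually set up; each acquisition records its own teardown obligation.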
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b4e8500afdb3..be7c86bae576 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -705,9 +705,9 @@ const struct file_operations kmsg_fops = {
705 705
706#ifdef CONFIG_KEXEC 706#ifdef CONFIG_KEXEC
707/* 707/*
708 * This appends the listed symbols to /proc/vmcoreinfo 708 * This appends the listed symbols to /proc/vmcore
709 * 709 *
710 * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to 710 * /proc/vmcore is used by various utilities, like crash and makedumpfile to
711 * obtain access to symbols that are otherwise very difficult to locate. These 711 * obtain access to symbols that are otherwise very difficult to locate. These
712 * symbols are specifically used so that utilities can access and extract the 712 * symbols are specifically used so that utilities can access and extract the
713 * dmesg log from a vmcore file after a crash. 713 * dmesg log from a vmcore file after a crash.
@@ -791,7 +791,7 @@ static bool __read_mostly ignore_loglevel;
791static int __init ignore_loglevel_setup(char *str) 791static int __init ignore_loglevel_setup(char *str)
792{ 792{
793 ignore_loglevel = 1; 793 ignore_loglevel = 1;
794 printk(KERN_INFO "debug: ignoring loglevel setting.\n"); 794 pr_info("debug: ignoring loglevel setting.\n");
795 795
796 return 0; 796 return 0;
797} 797}
@@ -820,9 +820,9 @@ static int __init boot_delay_setup(char *str)
820 pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " 820 pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
821 "HZ: %d, loops_per_msec: %llu\n", 821 "HZ: %d, loops_per_msec: %llu\n",
822 boot_delay, preset_lpj, lpj, HZ, loops_per_msec); 822 boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
823 return 1; 823 return 0;
824} 824}
825__setup("boot_delay=", boot_delay_setup); 825early_param("boot_delay", boot_delay_setup);
826 826
827static void boot_delay_msec(int level) 827static void boot_delay_msec(int level)
828{ 828{
@@ -2193,7 +2193,7 @@ static int __read_mostly keep_bootcon;
2193static int __init keep_bootcon_setup(char *str) 2193static int __init keep_bootcon_setup(char *str)
2194{ 2194{
2195 keep_bootcon = 1; 2195 keep_bootcon = 1;
2196 printk(KERN_INFO "debug: skip boot console de-registration.\n"); 2196 pr_info("debug: skip boot console de-registration.\n");
2197 2197
2198 return 0; 2198 return 0;
2199} 2199}
@@ -2241,7 +2241,7 @@ void register_console(struct console *newcon)
2241 /* find the last or real console */ 2241 /* find the last or real console */
2242 for_each_console(bcon) { 2242 for_each_console(bcon) {
2243 if (!(bcon->flags & CON_BOOT)) { 2243 if (!(bcon->flags & CON_BOOT)) {
2244 printk(KERN_INFO "Too late to register bootconsole %s%d\n", 2244 pr_info("Too late to register bootconsole %s%d\n",
2245 newcon->name, newcon->index); 2245 newcon->name, newcon->index);
2246 return; 2246 return;
2247 } 2247 }
@@ -2358,21 +2358,18 @@ void register_console(struct console *newcon)
2358 * users know there might be something in the kernel's log buffer that 2358 * users know there might be something in the kernel's log buffer that
2359 * went to the bootconsole (that they do not see on the real console) 2359 * went to the bootconsole (that they do not see on the real console)
2360 */ 2360 */
2361 pr_info("%sconsole [%s%d] enabled\n",
2362 (newcon->flags & CON_BOOT) ? "boot" : "" ,
2363 newcon->name, newcon->index);
2361 if (bcon && 2364 if (bcon &&
2362 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && 2365 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
2363 !keep_bootcon) { 2366 !keep_bootcon) {
2364 /* we need to iterate through twice, to make sure we print 2367 /* We need to iterate through all boot consoles, to make
2365 * everything out, before we unregister the console(s) 2368 * sure we print everything out, before we unregister them.
2366 */ 2369 */
2367 printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
2368 newcon->name, newcon->index);
2369 for_each_console(bcon) 2370 for_each_console(bcon)
2370 if (bcon->flags & CON_BOOT) 2371 if (bcon->flags & CON_BOOT)
2371 unregister_console(bcon); 2372 unregister_console(bcon);
2372 } else {
2373 printk(KERN_INFO "%sconsole [%s%d] enabled\n",
2374 (newcon->flags & CON_BOOT) ? "boot" : "" ,
2375 newcon->name, newcon->index);
2376 } 2373 }
2377} 2374}
2378EXPORT_SYMBOL(register_console); 2375EXPORT_SYMBOL(register_console);
@@ -2382,6 +2379,10 @@ int unregister_console(struct console *console)
2382 struct console *a, *b; 2379 struct console *a, *b;
2383 int res; 2380 int res;
2384 2381
2382 pr_info("%sconsole [%s%d] disabled\n",
2383 (console->flags & CON_BOOT) ? "boot" : "" ,
2384 console->name, console->index);
2385
2385 res = _braille_unregister_console(console); 2386 res = _braille_unregister_console(console);
2386 if (res) 2387 if (res)
2387 return res; 2388 return res;
@@ -2421,8 +2422,6 @@ static int __init printk_late_init(void)
2421 2422
2422 for_each_console(con) { 2423 for_each_console(con) {
2423 if (!keep_bootcon && con->flags & CON_BOOT) { 2424 if (!keep_bootcon && con->flags & CON_BOOT) {
2424 printk(KERN_INFO "turn off boot console %s%d\n",
2425 con->name, con->index);
2426 unregister_console(con); 2425 unregister_console(con);
2427 } 2426 }
2428 } 2427 }
@@ -2449,7 +2448,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
2449 2448
2450 if (pending & PRINTK_PENDING_SCHED) { 2449 if (pending & PRINTK_PENDING_SCHED) {
2451 char *buf = __get_cpu_var(printk_sched_buf); 2450 char *buf = __get_cpu_var(printk_sched_buf);
2452 printk(KERN_WARNING "[sched_delayed] %s", buf); 2451 pr_warn("[sched_delayed] %s", buf);
2453 } 2452 }
2454 2453
2455 if (pending & PRINTK_PENDING_WAKEUP) 2454 if (pending & PRINTK_PENDING_WAKEUP)
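
Most of the printk.c hunks above replace open-coded printk(KERN_INFO ...) calls with pr_info()/pr_warn() and consolidate the console enable/disable messages into register_console() and unregister_console(). As a rough userspace mock (the real kernel macros also route the format through pr_fmt(), omitted here), the wrappers simply bake the log level into the format string:

#include <stdio.h>

/* Userspace mock of the kernel's level prefixes and printk(). */
#define KERN_INFO	"<6>"
#define KERN_WARNING	"<4>"
#define printk(fmt, ...)	fprintf(stderr, fmt, ##__VA_ARGS__)

/* pr_info()/pr_warn() prepend the level, so call sites stay short. */
#define pr_info(fmt, ...)	printk(KERN_INFO fmt, ##__VA_ARGS__)
#define pr_warn(fmt, ...)	printk(KERN_WARNING fmt, ##__VA_ARGS__)

int main(void)
{
	const char *name = "ttyS";
	int index = 0;

	printk(KERN_INFO "console [%s%d] enabled\n", name, index);	/* old style */
	pr_info("console [%s%d] enabled\n", name, index);		/* new style */
	pr_warn("[sched_delayed] %s", "deferred message\n");
	return 0;
}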
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a146ee327f6a..1f4bcb3cc21c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
236 */ 236 */
237 int dumpable = 0; 237 int dumpable = 0;
238 /* Don't let security modules deny introspection */ 238 /* Don't let security modules deny introspection */
239 if (task == current) 239 if (same_thread_group(task, current))
240 return 0; 240 return 0;
241 rcu_read_lock(); 241 rcu_read_lock();
242 tcred = __task_cred(task); 242 tcred = __task_cred(task);
@@ -257,7 +257,8 @@ ok:
257 if (task->mm) 257 if (task->mm)
258 dumpable = get_dumpable(task->mm); 258 dumpable = get_dumpable(task->mm);
259 rcu_read_lock(); 259 rcu_read_lock();
260 if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { 260 if (dumpable != SUID_DUMP_USER &&
261 !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
261 rcu_read_unlock(); 262 rcu_read_unlock();
262 return -EPERM; 263 return -EPERM;
263 } 264 }
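
The second ptrace hunk replaces the "!dumpable" test with an explicit comparison against SUID_DUMP_USER because get_dumpable() is a tri-state, not a boolean: a "dump as root only" setting is non-zero yet must still require CAP_SYS_PTRACE. A small sketch of the difference, assuming the usual 0/1/2 encoding of the dumpable states:

#include <stdbool.h>
#include <stdio.h>

/* Assumed tri-state encoding of the mm "dumpable" setting. */
enum { SUID_DUMP_DISABLE = 0, SUID_DUMP_USER = 1, SUID_DUMP_ROOT = 2 };

/* Old check: any non-zero value looked "dumpable enough". */
static bool may_access_old(int dumpable, bool has_cap)
{
	return dumpable || has_cap;
}

/* New check: only the fully dumpable state skips the capability test. */
static bool may_access_new(int dumpable, bool has_cap)
{
	return dumpable == SUID_DUMP_USER || has_cap;
}

int main(void)
{
	int dumpable = SUID_DUMP_ROOT;	/* e.g. a setuid target with root-only dumps */
	bool has_cap = false;		/* tracer lacks CAP_SYS_PTRACE */

	printf("old check grants access: %d\n", may_access_old(dumpable, has_cap));	/* 1: too permissive */
	printf("new check grants access: %d\n", may_access_new(dumpable, has_cap));	/* 0: denied */
	return 0;
}

The first hunk is related but separate: same_thread_group() lets any thread of the tracing process introspect itself, not just the single task_struct that equals current.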
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
new file mode 100644
index 000000000000..01e9ec37a3e3
--- /dev/null
+++ b/kernel/rcu/Makefile
@@ -0,0 +1,6 @@
1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o
3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
6obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h
index 77131966c4ad..7859a0a3951e 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -122,4 +122,11 @@ int rcu_jiffies_till_stall_check(void);
122 122
123#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 123#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
124 124
125/*
126 * Strings used in tracepoints need to be exported via the
127 * tracing system such that tools like perf and trace-cmd can
128 * translate the string address pointers to actual text.
129 */
130#define TPS(x) tracepoint_string(x)
131
125#endif /* __LINUX_RCU_H */ 132#endif /* __LINUX_RCU_H */
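
The TPS() macro moved here wraps tracepoint strings in tracepoint_string() so that their addresses can be translated back to text by tools such as perf and trace-cmd, as the new comment says. As a loose userspace analogue of the idea (not the kernel mechanism), the hot path records only a pointer and a separately exported table provides the pointer-to-text mapping:

#include <stdio.h>

/* Side table of trace strings, exported for post-processing tools. */
static const char * const trace_strings[] = {
	"Start", "End", "newreq", "fqswait",
};

/* Hot path: record just the address, which is cheap and fixed-size. */
static const char *record_event(unsigned int idx)
{
	return trace_strings[idx];
}

/* Post-processing: translate recorded addresses back to readable text. */
static void dump_translation_table(void)
{
	for (unsigned int i = 0; i < sizeof(trace_strings) / sizeof(trace_strings[0]); i++)
		printf("%p -> %s\n", (void *)trace_strings[i], trace_strings[i]);
}

int main(void)
{
	const char *ev = record_event(2);

	printf("recorded pointer %p\n", (void *)ev);
	dump_translation_table();
	return 0;
}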
diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c
index 01d5ccb8bfe3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/rcu/srcu.c
diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c
index 9ed6075dc562..1254f312d024 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,6 +35,7 @@
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/prefetch.h> 37#include <linux/prefetch.h>
38#include <linux/ftrace_event.h>
38 39
39#ifdef CONFIG_RCU_TRACE 40#ifdef CONFIG_RCU_TRACE
40#include <trace/events/rcu.h> 41#include <trace/events/rcu.h>
@@ -42,7 +43,7 @@
42 43
43#include "rcu.h" 44#include "rcu.h"
44 45
45/* Forward declarations for rcutiny_plugin.h. */ 46/* Forward declarations for tiny_plugin.h. */
46struct rcu_ctrlblk; 47struct rcu_ctrlblk;
47static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static void rcu_process_callbacks(struct softirq_action *unused); 49static void rcu_process_callbacks(struct softirq_action *unused);
@@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head,
52 53
53static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
54 55
55#include "rcutiny_plugin.h" 56#include "tiny_plugin.h"
56 57
57/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
58static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
59{ 60{
60 if (newval) { 61 if (newval) {
61 RCU_TRACE(trace_rcu_dyntick("--=", 62 RCU_TRACE(trace_rcu_dyntick(TPS("--="),
62 rcu_dynticks_nesting, newval)); 63 rcu_dynticks_nesting, newval));
63 rcu_dynticks_nesting = newval; 64 rcu_dynticks_nesting = newval;
64 return; 65 return;
65 } 66 }
66 RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); 67 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
68 rcu_dynticks_nesting, newval));
67 if (!is_idle_task(current)) { 69 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id()); 70 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
69 71
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", 72 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
71 rcu_dynticks_nesting, newval)); 73 rcu_dynticks_nesting, newval));
72 ftrace_dump(DUMP_ALL); 74 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 75 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit);
120static void rcu_idle_exit_common(long long oldval) 122static void rcu_idle_exit_common(long long oldval)
121{ 123{
122 if (oldval) { 124 if (oldval) {
123 RCU_TRACE(trace_rcu_dyntick("++=", 125 RCU_TRACE(trace_rcu_dyntick(TPS("++="),
124 oldval, rcu_dynticks_nesting)); 126 oldval, rcu_dynticks_nesting));
125 return; 127 return;
126 } 128 }
127 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); 129 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
128 if (!is_idle_task(current)) { 130 if (!is_idle_task(current)) {
129 struct task_struct *idle = idle_task(smp_processor_id()); 131 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
130 132
131 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", 133 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
132 oldval, rcu_dynticks_nesting)); 134 oldval, rcu_dynticks_nesting));
133 ftrace_dump(DUMP_ALL); 135 ftrace_dump(DUMP_ALL);
134 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 136 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -174,18 +176,18 @@ void rcu_irq_enter(void)
174} 176}
175EXPORT_SYMBOL_GPL(rcu_irq_enter); 177EXPORT_SYMBOL_GPL(rcu_irq_enter);
176 178
177#ifdef CONFIG_DEBUG_LOCK_ALLOC 179#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
178 180
179/* 181/*
180 * Test whether RCU thinks that the current CPU is idle. 182 * Test whether RCU thinks that the current CPU is idle.
181 */ 183 */
182int rcu_is_cpu_idle(void) 184bool notrace __rcu_is_watching(void)
183{ 185{
184 return !rcu_dynticks_nesting; 186 return rcu_dynticks_nesting;
185} 187}
186EXPORT_SYMBOL(rcu_is_cpu_idle); 188EXPORT_SYMBOL(__rcu_is_watching);
187 189
188#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 190#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
189 191
190/* 192/*
191 * Test whether the current CPU was interrupted from idle. Nested 193 * Test whether the current CPU was interrupted from idle. Nested
@@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
273 if (&rcp->rcucblist == rcp->donetail) { 275 if (&rcp->rcucblist == rcp->donetail) {
274 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); 276 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
275 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, 277 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
276 ACCESS_ONCE(rcp->rcucblist), 278 !!ACCESS_ONCE(rcp->rcucblist),
277 need_resched(), 279 need_resched(),
278 is_idle_task(current), 280 is_idle_task(current),
279 false)); 281 false));
@@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
304 RCU_TRACE(cb_count++); 306 RCU_TRACE(cb_count++);
305 } 307 }
306 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 308 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
307 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), 309 RCU_TRACE(trace_rcu_batch_end(rcp->name,
310 cb_count, 0, need_resched(),
308 is_idle_task(current), 311 is_idle_task(current),
309 false)); 312 false));
310} 313}
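
In the tiny flavour, rcu_is_cpu_idle() returned !rcu_dynticks_nesting, while its replacement __rcu_is_watching() returns the nesting count itself, so the predicate at the call sites is inverted: "is RCU watching this CPU?" rather than "is this CPU idle?". A simplified sketch of the nesting-counter semantics (the real counter carries DYNTICK_TASK_* bias values, omitted here):

#include <stdbool.h>
#include <stdio.h>

/* Simplified dynticks nesting counter: zero means idle, RCU not watching. */
static long rcu_dynticks_nesting = 1;

static void rcu_idle_enter(void) { rcu_dynticks_nesting--; }
static void rcu_idle_exit(void)  { rcu_dynticks_nesting++; }

/* Old predicate: true when the CPU is idle from RCU's point of view. */
static bool rcu_is_cpu_idle(void) { return !rcu_dynticks_nesting; }

/* New predicate: true when RCU read-side critical sections are safe. */
static bool __rcu_is_watching(void) { return rcu_dynticks_nesting != 0; }

int main(void)
{
	printf("running: idle=%d watching=%d\n", rcu_is_cpu_idle(), __rcu_is_watching());
	rcu_idle_enter();
	printf("idle:    idle=%d watching=%d\n", rcu_is_cpu_idle(), __rcu_is_watching());
	rcu_idle_exit();
	printf("running: idle=%d watching=%d\n", rcu_is_cpu_idle(), __rcu_is_watching());
	return 0;
}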
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 280d06cae352..280d06cae352 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c
index be63101c6175..3929cd451511 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcu/torture.c
@@ -52,6 +52,12 @@
52MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 54
55MODULE_ALIAS("rcutorture");
56#ifdef MODULE_PARAM_PREFIX
57#undef MODULE_PARAM_PREFIX
58#endif
59#define MODULE_PARAM_PREFIX "rcutorture."
60
55static int fqs_duration; 61static int fqs_duration;
56module_param(fqs_duration, int, 0444); 62module_param(fqs_duration, int, 0444);
57MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); 63MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
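
Redefining MODULE_PARAM_PREFIX before the module_param() calls makes every torture parameter appear as "rcutorture.<name>", whether the code is modular or built in, so "rcutorture.fqs_duration=..." reads the same on the boot command line and at module load time. A hedged sketch of the underlying trick, with a hypothetical registration macro standing in for module_param(): the prefix is a string literal pasted in front of the stringified parameter name.

#include <stdio.h>

#define MODULE_PARAM_PREFIX	"rcutorture."
#define PARAM_NAME(name)	MODULE_PARAM_PREFIX #name

/* Hypothetical stand-in for module_param(): just print the visible name. */
#define module_param_demo(name) \
	printf("registered parameter: %s\n", PARAM_NAME(name))

int main(void)
{
	module_param_demo(fqs_duration);	/* prints "rcutorture.fqs_duration" */
	module_param_demo(fqs_holdoff);
	return 0;
}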
diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c
index 32618b3fe4e6..dd081987a8ec 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcu/tree.c
@@ -41,6 +41,7 @@
41#include <linux/export.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/module.h>
44#include <linux/percpu.h> 45#include <linux/percpu.h>
45#include <linux/notifier.h> 46#include <linux/notifier.h>
46#include <linux/cpu.h> 47#include <linux/cpu.h>
@@ -56,17 +57,16 @@
56#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
57#include <linux/suspend.h> 58#include <linux/suspend.h>
58 59
59#include "rcutree.h" 60#include "tree.h"
60#include <trace/events/rcu.h> 61#include <trace/events/rcu.h>
61 62
62#include "rcu.h" 63#include "rcu.h"
63 64
64/* 65MODULE_ALIAS("rcutree");
65 * Strings used in tracepoints need to be exported via the 66#ifdef MODULE_PARAM_PREFIX
66 * tracing system such that tools like perf and trace-cmd can 67#undef MODULE_PARAM_PREFIX
67 * translate the string address pointers to actual text. 68#endif
68 */ 69#define MODULE_PARAM_PREFIX "rcutree."
69#define TPS(x) tracepoint_string(x)
70 70
71/* Data structures. */ 71/* Data structures. */
72 72
@@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu)
222} 222}
223EXPORT_SYMBOL_GPL(rcu_note_context_switch); 223EXPORT_SYMBOL_GPL(rcu_note_context_switch);
224 224
225DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 225static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
227 .dynticks = ATOMIC_INIT(1), 227 .dynticks = ATOMIC_INIT(1),
228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
@@ -371,7 +371,8 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
371{ 371{
372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
373 if (!user && !is_idle_task(current)) { 373 if (!user && !is_idle_task(current)) {
374 struct task_struct *idle = idle_task(smp_processor_id()); 374 struct task_struct *idle __maybe_unused =
375 idle_task(smp_processor_id());
375 376
376 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); 377 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
377 ftrace_dump(DUMP_ORIG); 378 ftrace_dump(DUMP_ORIG);
@@ -407,7 +408,7 @@ static void rcu_eqs_enter(bool user)
407 long long oldval; 408 long long oldval;
408 struct rcu_dynticks *rdtp; 409 struct rcu_dynticks *rdtp;
409 410
410 rdtp = &__get_cpu_var(rcu_dynticks); 411 rdtp = this_cpu_ptr(&rcu_dynticks);
411 oldval = rdtp->dynticks_nesting; 412 oldval = rdtp->dynticks_nesting;
412 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 413 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
413 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 414 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
@@ -435,7 +436,7 @@ void rcu_idle_enter(void)
435 436
436 local_irq_save(flags); 437 local_irq_save(flags);
437 rcu_eqs_enter(false); 438 rcu_eqs_enter(false);
438 rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); 439 rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
439 local_irq_restore(flags); 440 local_irq_restore(flags);
440} 441}
441EXPORT_SYMBOL_GPL(rcu_idle_enter); 442EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -478,7 +479,7 @@ void rcu_irq_exit(void)
478 struct rcu_dynticks *rdtp; 479 struct rcu_dynticks *rdtp;
479 480
480 local_irq_save(flags); 481 local_irq_save(flags);
481 rdtp = &__get_cpu_var(rcu_dynticks); 482 rdtp = this_cpu_ptr(&rcu_dynticks);
482 oldval = rdtp->dynticks_nesting; 483 oldval = rdtp->dynticks_nesting;
483 rdtp->dynticks_nesting--; 484 rdtp->dynticks_nesting--;
484 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 485 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
@@ -508,7 +509,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
508 rcu_cleanup_after_idle(smp_processor_id()); 509 rcu_cleanup_after_idle(smp_processor_id());
509 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 510 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
510 if (!user && !is_idle_task(current)) { 511 if (!user && !is_idle_task(current)) {
511 struct task_struct *idle = idle_task(smp_processor_id()); 512 struct task_struct *idle __maybe_unused =
513 idle_task(smp_processor_id());
512 514
513 trace_rcu_dyntick(TPS("Error on exit: not idle task"), 515 trace_rcu_dyntick(TPS("Error on exit: not idle task"),
514 oldval, rdtp->dynticks_nesting); 516 oldval, rdtp->dynticks_nesting);
@@ -528,7 +530,7 @@ static void rcu_eqs_exit(bool user)
528 struct rcu_dynticks *rdtp; 530 struct rcu_dynticks *rdtp;
529 long long oldval; 531 long long oldval;
530 532
531 rdtp = &__get_cpu_var(rcu_dynticks); 533 rdtp = this_cpu_ptr(&rcu_dynticks);
532 oldval = rdtp->dynticks_nesting; 534 oldval = rdtp->dynticks_nesting;
533 WARN_ON_ONCE(oldval < 0); 535 WARN_ON_ONCE(oldval < 0);
534 if (oldval & DYNTICK_TASK_NEST_MASK) 536 if (oldval & DYNTICK_TASK_NEST_MASK)
@@ -555,7 +557,7 @@ void rcu_idle_exit(void)
555 557
556 local_irq_save(flags); 558 local_irq_save(flags);
557 rcu_eqs_exit(false); 559 rcu_eqs_exit(false);
558 rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); 560 rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
559 local_irq_restore(flags); 561 local_irq_restore(flags);
560} 562}
561EXPORT_SYMBOL_GPL(rcu_idle_exit); 563EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -599,7 +601,7 @@ void rcu_irq_enter(void)
599 long long oldval; 601 long long oldval;
600 602
601 local_irq_save(flags); 603 local_irq_save(flags);
602 rdtp = &__get_cpu_var(rcu_dynticks); 604 rdtp = this_cpu_ptr(&rcu_dynticks);
603 oldval = rdtp->dynticks_nesting; 605 oldval = rdtp->dynticks_nesting;
604 rdtp->dynticks_nesting++; 606 rdtp->dynticks_nesting++;
605 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 607 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
@@ -620,7 +622,7 @@ void rcu_irq_enter(void)
620 */ 622 */
621void rcu_nmi_enter(void) 623void rcu_nmi_enter(void)
622{ 624{
623 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 625 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
624 626
625 if (rdtp->dynticks_nmi_nesting == 0 && 627 if (rdtp->dynticks_nmi_nesting == 0 &&
626 (atomic_read(&rdtp->dynticks) & 0x1)) 628 (atomic_read(&rdtp->dynticks) & 0x1))
@@ -642,7 +644,7 @@ void rcu_nmi_enter(void)
642 */ 644 */
643void rcu_nmi_exit(void) 645void rcu_nmi_exit(void)
644{ 646{
645 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 647 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
646 648
647 if (rdtp->dynticks_nmi_nesting == 0 || 649 if (rdtp->dynticks_nmi_nesting == 0 ||
648 --rdtp->dynticks_nmi_nesting != 0) 650 --rdtp->dynticks_nmi_nesting != 0)
@@ -655,21 +657,34 @@ void rcu_nmi_exit(void)
655} 657}
656 658
657/** 659/**
658 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 660 * __rcu_is_watching - are RCU read-side critical sections safe?
661 *
662 * Return true if RCU is watching the running CPU, which means that
663 * this CPU can safely enter RCU read-side critical sections. Unlike
664 * rcu_is_watching(), the caller of __rcu_is_watching() must have at
665 * least disabled preemption.
666 */
667bool notrace __rcu_is_watching(void)
668{
669 return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
670}
671
672/**
673 * rcu_is_watching - see if RCU thinks that the current CPU is idle
659 * 674 *
660 * If the current CPU is in its idle loop and is neither in an interrupt 675 * If the current CPU is in its idle loop and is neither in an interrupt
661 * or NMI handler, return true. 676 * or NMI handler, return true.
662 */ 677 */
663int rcu_is_cpu_idle(void) 678bool notrace rcu_is_watching(void)
664{ 679{
665 int ret; 680 int ret;
666 681
667 preempt_disable(); 682 preempt_disable();
668 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; 683 ret = __rcu_is_watching();
669 preempt_enable(); 684 preempt_enable();
670 return ret; 685 return ret;
671} 686}
672EXPORT_SYMBOL(rcu_is_cpu_idle); 687EXPORT_SYMBOL_GPL(rcu_is_watching);
673 688
674#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 689#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
675 690
@@ -703,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void)
703 if (in_nmi()) 718 if (in_nmi())
704 return 1; 719 return 1;
705 preempt_disable(); 720 preempt_disable();
706 rdp = &__get_cpu_var(rcu_sched_data); 721 rdp = this_cpu_ptr(&rcu_sched_data);
707 rnp = rdp->mynode; 722 rnp = rdp->mynode;
708 ret = (rdp->grpmask & rnp->qsmaskinit) || 723 ret = (rdp->grpmask & rnp->qsmaskinit) ||
709 !rcu_scheduler_fully_active; 724 !rcu_scheduler_fully_active;
@@ -723,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
723 */ 738 */
724static int rcu_is_cpu_rrupt_from_idle(void) 739static int rcu_is_cpu_rrupt_from_idle(void)
725{ 740{
726 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 741 return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
727} 742}
728 743
729/* 744/*
@@ -802,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
802 817
803static void record_gp_stall_check_time(struct rcu_state *rsp) 818static void record_gp_stall_check_time(struct rcu_state *rsp)
804{ 819{
805 rsp->gp_start = jiffies; 820 unsigned long j = ACCESS_ONCE(jiffies);
806 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 821
822 rsp->gp_start = j;
823 smp_wmb(); /* Record start time before stall time. */
824 rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
807} 825}
808 826
809/* 827/*
@@ -898,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
898 force_quiescent_state(rsp); /* Kick them all. */ 916 force_quiescent_state(rsp); /* Kick them all. */
899} 917}
900 918
919/*
920 * This function really isn't for public consumption, but RCU is special in
921 * that context switches can allow the state machine to make progress.
922 */
923extern void resched_cpu(int cpu);
924
901static void print_cpu_stall(struct rcu_state *rsp) 925static void print_cpu_stall(struct rcu_state *rsp)
902{ 926{
903 int cpu; 927 int cpu;
@@ -927,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp)
927 3 * rcu_jiffies_till_stall_check() + 3; 951 3 * rcu_jiffies_till_stall_check() + 3;
928 raw_spin_unlock_irqrestore(&rnp->lock, flags); 952 raw_spin_unlock_irqrestore(&rnp->lock, flags);
929 953
930 set_need_resched(); /* kick ourselves to get things going. */ 954 /*
955 * Attempt to revive the RCU machinery by forcing a context switch.
956 *
957 * A context switch would normally allow the RCU state machine to make
958 * progress and it could be we're stuck in kernel space without context
959 * switches for an entirely unreasonable amount of time.
960 */
961 resched_cpu(smp_processor_id());
931} 962}
932 963
933static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 964static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
934{ 965{
966 unsigned long completed;
967 unsigned long gpnum;
968 unsigned long gps;
935 unsigned long j; 969 unsigned long j;
936 unsigned long js; 970 unsigned long js;
937 struct rcu_node *rnp; 971 struct rcu_node *rnp;
938 972
939 if (rcu_cpu_stall_suppress) 973 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
940 return; 974 return;
941 j = ACCESS_ONCE(jiffies); 975 j = ACCESS_ONCE(jiffies);
976
977 /*
978 * Lots of memory barriers to reject false positives.
979 *
980 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
981 * then rsp->gp_start, and finally rsp->completed. These values
982 * are updated in the opposite order with memory barriers (or
983 * equivalent) during grace-period initialization and cleanup.
 984 * Now, a false positive can occur if we get a new value of
 985 * rsp->gp_start and an old value of rsp->jiffies_stall. But given
986 * the memory barriers, the only way that this can happen is if one
987 * grace period ends and another starts between these two fetches.
988 * Detect this by comparing rsp->completed with the previous fetch
989 * from rsp->gpnum.
990 *
991 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
992 * and rsp->gp_start suffice to forestall false positives.
993 */
994 gpnum = ACCESS_ONCE(rsp->gpnum);
995 smp_rmb(); /* Pick up ->gpnum first... */
942 js = ACCESS_ONCE(rsp->jiffies_stall); 996 js = ACCESS_ONCE(rsp->jiffies_stall);
997 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
998 gps = ACCESS_ONCE(rsp->gp_start);
999 smp_rmb(); /* ...and finally ->gp_start before ->completed. */
1000 completed = ACCESS_ONCE(rsp->completed);
1001 if (ULONG_CMP_GE(completed, gpnum) ||
1002 ULONG_CMP_LT(j, js) ||
1003 ULONG_CMP_GE(gps, js))
1004 return; /* No stall or GP completed since entering function. */
943 rnp = rdp->mynode; 1005 rnp = rdp->mynode;
944 if (rcu_gp_in_progress(rsp) && 1006 if (rcu_gp_in_progress(rsp) &&
945 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 1007 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
946 1008
947 /* We haven't checked in, so go dump stack. */ 1009 /* We haven't checked in, so go dump stack. */
948 print_cpu_stall(rsp); 1010 print_cpu_stall(rsp);
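
The stall-warning rework above depends on paired ordering: the grace-period side writes gp_start before jiffies_stall (separated by smp_wmb()), and check_cpu_stall() loads them in the opposite order with smp_rmb() in between, plus the gpnum/completed cross-check. The following userspace sketch shows just that publish/consume pattern with C11 fences; the field names mirror the kernel ones, but the surrounding grace-period state machine is left out.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long gp_start;		/* when the grace period began */
static _Atomic unsigned long jiffies_stall;	/* when to start complaining */

/* Writer: publish gp_start strictly before jiffies_stall. */
static void record_gp_stall_check_time(unsigned long now, unsigned long timeout)
{
	atomic_store_explicit(&gp_start, now, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* smp_wmb() analogue */
	atomic_store_explicit(&jiffies_stall, now + timeout, memory_order_relaxed);
}

/* Reader: load in the opposite order with an smp_rmb() analogue in between. */
static int check_cpu_stall(unsigned long now)
{
	unsigned long js = atomic_load_explicit(&jiffies_stall, memory_order_relaxed);
	atomic_thread_fence(memory_order_acquire);
	unsigned long gps = atomic_load_explicit(&gp_start, memory_order_relaxed);

	/* A jiffies_stall value that predates gp_start means we raced: bail out. */
	if (gps >= js)
		return 0;
	return now >= js;	/* overdue, report a stall */
}

int main(void)
{
	record_gp_stall_check_time(1000, 21 * 250);
	printf("stall at t=2000? %d\n", check_cpu_stall(2000));
	printf("stall at t=7000? %d\n", check_cpu_stall(7000));
	return 0;
}

If the reader observes the jiffies_stall value written after the release fence, the acquire fence guarantees it also observes the matching gp_start, which is what the kernel comment's "memory barriers (or equivalent)" wording relies on.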
@@ -1297,7 +1359,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1297} 1359}
1298 1360
1299/* 1361/*
1300 * Initialize a new grace period. 1362 * Initialize a new grace period. Return 0 if no grace period required.
1301 */ 1363 */
1302static int rcu_gp_init(struct rcu_state *rsp) 1364static int rcu_gp_init(struct rcu_state *rsp)
1303{ 1365{
@@ -1306,18 +1368,27 @@ static int rcu_gp_init(struct rcu_state *rsp)
1306 1368
1307 rcu_bind_gp_kthread(); 1369 rcu_bind_gp_kthread();
1308 raw_spin_lock_irq(&rnp->lock); 1370 raw_spin_lock_irq(&rnp->lock);
1371 if (rsp->gp_flags == 0) {
1372 /* Spurious wakeup, tell caller to go back to sleep. */
1373 raw_spin_unlock_irq(&rnp->lock);
1374 return 0;
1375 }
1309 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1376 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1310 1377
1311 if (rcu_gp_in_progress(rsp)) { 1378 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
1312 /* Grace period already in progress, don't start another. */ 1379 /*
1380 * Grace period already in progress, don't start another.
1381 * Not supposed to be able to happen.
1382 */
1313 raw_spin_unlock_irq(&rnp->lock); 1383 raw_spin_unlock_irq(&rnp->lock);
1314 return 0; 1384 return 0;
1315 } 1385 }
1316 1386
1317 /* Advance to a new grace period and initialize state. */ 1387 /* Advance to a new grace period and initialize state. */
1388 record_gp_stall_check_time(rsp);
1389 smp_wmb(); /* Record GP times before starting GP. */
1318 rsp->gpnum++; 1390 rsp->gpnum++;
1319 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1391 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1320 record_gp_stall_check_time(rsp);
1321 raw_spin_unlock_irq(&rnp->lock); 1392 raw_spin_unlock_irq(&rnp->lock);
1322 1393
1323 /* Exclude any concurrent CPU-hotplug operations. */ 1394 /* Exclude any concurrent CPU-hotplug operations. */
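
rcu_gp_init() now re-checks gp_flags under the root node lock and returns 0 on a spurious wakeup, letting the grace-period kthread go back to sleep instead of tripping the WARN_ON_ONCE(). This is the usual "wake up, retake the lock, re-test the condition" pattern; below is a condvar sketch of it under deliberately simplified locking (in the kernel the wait can also return early on a stray signal, which is why the re-check lives inside rcu_gp_init() itself).

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int gp_flags;	/* non-zero: a new grace period has been requested */

/* Returns 1 if a grace period really starts, 0 on a spurious wakeup. */
static int gp_init(void)
{
	int start;

	pthread_mutex_lock(&lock);
	start = (gp_flags != 0);	/* re-check the condition under the lock */
	gp_flags = 0;			/* consume the request, if any */
	pthread_mutex_unlock(&lock);
	return start;
}

static void *gp_kthread(void *arg)
{
	(void)arg;
	for (;;) {
		/* wait_event analogue: sleep until a request looks pending. */
		pthread_mutex_lock(&lock);
		while (!gp_flags)
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);

		/* The request may have been consumed meanwhile: verify inside. */
		if (gp_init())
			break;
	}
	puts("grace period started");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, gp_kthread, NULL);
	pthread_mutex_lock(&lock);
	gp_flags = 1;			/* request a grace period */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}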
@@ -1366,7 +1437,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1366/* 1437/*
1367 * Do one round of quiescent-state forcing. 1438 * Do one round of quiescent-state forcing.
1368 */ 1439 */
1369int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1440static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1370{ 1441{
1371 int fqs_state = fqs_state_in; 1442 int fqs_state = fqs_state_in;
1372 bool isidle = false; 1443 bool isidle = false;
@@ -1451,8 +1522,12 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1451 rsp->fqs_state = RCU_GP_IDLE; 1522 rsp->fqs_state = RCU_GP_IDLE;
1452 rdp = this_cpu_ptr(rsp->rda); 1523 rdp = this_cpu_ptr(rsp->rda);
1453 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1524 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1454 if (cpu_needs_another_gp(rsp, rdp)) 1525 if (cpu_needs_another_gp(rsp, rdp)) {
1455 rsp->gp_flags = 1; 1526 rsp->gp_flags = RCU_GP_FLAG_INIT;
1527 trace_rcu_grace_period(rsp->name,
1528 ACCESS_ONCE(rsp->gpnum),
1529 TPS("newreq"));
1530 }
1456 raw_spin_unlock_irq(&rnp->lock); 1531 raw_spin_unlock_irq(&rnp->lock);
1457} 1532}
1458 1533
@@ -1462,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1462static int __noreturn rcu_gp_kthread(void *arg) 1537static int __noreturn rcu_gp_kthread(void *arg)
1463{ 1538{
1464 int fqs_state; 1539 int fqs_state;
1540 int gf;
1465 unsigned long j; 1541 unsigned long j;
1466 int ret; 1542 int ret;
1467 struct rcu_state *rsp = arg; 1543 struct rcu_state *rsp = arg;
@@ -1471,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg)
1471 1547
1472 /* Handle grace-period start. */ 1548 /* Handle grace-period start. */
1473 for (;;) { 1549 for (;;) {
1550 trace_rcu_grace_period(rsp->name,
1551 ACCESS_ONCE(rsp->gpnum),
1552 TPS("reqwait"));
1474 wait_event_interruptible(rsp->gp_wq, 1553 wait_event_interruptible(rsp->gp_wq,
1475 rsp->gp_flags & 1554 ACCESS_ONCE(rsp->gp_flags) &
1476 RCU_GP_FLAG_INIT); 1555 RCU_GP_FLAG_INIT);
1477 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && 1556 if (rcu_gp_init(rsp))
1478 rcu_gp_init(rsp))
1479 break; 1557 break;
1480 cond_resched(); 1558 cond_resched();
1481 flush_signals(current); 1559 flush_signals(current);
1560 trace_rcu_grace_period(rsp->name,
1561 ACCESS_ONCE(rsp->gpnum),
1562 TPS("reqwaitsig"));
1482 } 1563 }
1483 1564
1484 /* Handle quiescent-state forcing. */ 1565 /* Handle quiescent-state forcing. */
@@ -1488,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg)
1488 j = HZ; 1569 j = HZ;
1489 jiffies_till_first_fqs = HZ; 1570 jiffies_till_first_fqs = HZ;
1490 } 1571 }
1572 ret = 0;
1491 for (;;) { 1573 for (;;) {
1492 rsp->jiffies_force_qs = jiffies + j; 1574 if (!ret)
1575 rsp->jiffies_force_qs = jiffies + j;
1576 trace_rcu_grace_period(rsp->name,
1577 ACCESS_ONCE(rsp->gpnum),
1578 TPS("fqswait"));
1493 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1579 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1494 (rsp->gp_flags & RCU_GP_FLAG_FQS) || 1580 ((gf = ACCESS_ONCE(rsp->gp_flags)) &
1581 RCU_GP_FLAG_FQS) ||
1495 (!ACCESS_ONCE(rnp->qsmask) && 1582 (!ACCESS_ONCE(rnp->qsmask) &&
1496 !rcu_preempt_blocked_readers_cgp(rnp)), 1583 !rcu_preempt_blocked_readers_cgp(rnp)),
1497 j); 1584 j);
@@ -1500,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg)
1500 !rcu_preempt_blocked_readers_cgp(rnp)) 1587 !rcu_preempt_blocked_readers_cgp(rnp))
1501 break; 1588 break;
1502 /* If time for quiescent-state forcing, do it. */ 1589 /* If time for quiescent-state forcing, do it. */
1503 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { 1590 if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
1591 (gf & RCU_GP_FLAG_FQS)) {
1592 trace_rcu_grace_period(rsp->name,
1593 ACCESS_ONCE(rsp->gpnum),
1594 TPS("fqsstart"));
1504 fqs_state = rcu_gp_fqs(rsp, fqs_state); 1595 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1596 trace_rcu_grace_period(rsp->name,
1597 ACCESS_ONCE(rsp->gpnum),
1598 TPS("fqsend"));
1505 cond_resched(); 1599 cond_resched();
1506 } else { 1600 } else {
1507 /* Deal with stray signal. */ 1601 /* Deal with stray signal. */
1508 cond_resched(); 1602 cond_resched();
1509 flush_signals(current); 1603 flush_signals(current);
1604 trace_rcu_grace_period(rsp->name,
1605 ACCESS_ONCE(rsp->gpnum),
1606 TPS("fqswaitsig"));
1510 } 1607 }
1511 j = jiffies_till_next_fqs; 1608 j = jiffies_till_next_fqs;
1512 if (j > HZ) { 1609 if (j > HZ) {
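
The forcing loop now re-arms rsp->jiffies_force_qs only when the previous wait actually timed out (ret == 0) and decides "time to force quiescent states" with ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) rather than trusting the timeout return value alone. ULONG_CMP_GE() is RCU's wrap-safe unsigned comparison; the sketch below assumes it is defined with the usual "difference within half the range" trick, which may not match the exact kernel definition.

#include <limits.h>
#include <stdio.h>

/*
 * Assumed wrap-safe comparison: a >= b as long as the two values are less
 * than half the unsigned range apart, even if the counter wrapped around.
 */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long deadline = ULONG_MAX - 5;	/* jiffies_force_qs set just before wrap */
	unsigned long now = 10;			/* jiffies has since wrapped past zero */

	printf("naive now >= deadline:       %d\n", now >= deadline);			/* 0: wrong */
	printf("ULONG_CMP_GE(now, deadline): %d\n", ULONG_CMP_GE(now, deadline));	/* 1: correct */
	return 0;
}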
@@ -1554,6 +1651,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1554 return; 1651 return;
1555 } 1652 }
1556 rsp->gp_flags = RCU_GP_FLAG_INIT; 1653 rsp->gp_flags = RCU_GP_FLAG_INIT;
1654 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
1655 TPS("newreq"));
1557 1656
1558 /* 1657 /*
1559 * We can't do wakeups while holding the rnp->lock, as that 1658 * We can't do wakeups while holding the rnp->lock, as that
@@ -2255,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2255 * If called from an extended quiescent state, invoke the RCU 2354 * If called from an extended quiescent state, invoke the RCU
2256 * core in order to force a re-evaluation of RCU's idleness. 2355 * core in order to force a re-evaluation of RCU's idleness.
2257 */ 2356 */
2258 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) 2357 if (!rcu_is_watching() && cpu_online(smp_processor_id()))
2259 invoke_rcu_core(); 2358 invoke_rcu_core();
2260 2359
2261 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2360 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2725,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2725 2824
2726 for_each_rcu_flavor(rsp) { 2825 for_each_rcu_flavor(rsp) {
2727 rdp = per_cpu_ptr(rsp->rda, cpu); 2826 rdp = per_cpu_ptr(rsp->rda, cpu);
2728 if (rdp->qlen != rdp->qlen_lazy) 2827 if (!rdp->nxtlist)
2828 continue;
2829 hc = true;
2830 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
2729 al = false; 2831 al = false;
2730 if (rdp->nxtlist) 2832 break;
2731 hc = true; 2833 }
2732 } 2834 }
2733 if (all_lazy) 2835 if (all_lazy)
2734 *all_lazy = al; 2836 *all_lazy = al;
@@ -3216,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3216 3318
3217/* 3319/*
3218 * Compute the rcu_node tree geometry from kernel parameters. This cannot 3320 * Compute the rcu_node tree geometry from kernel parameters. This cannot
3219 * replace the definitions in rcutree.h because those are needed to size 3321 * replace the definitions in tree.h because those are needed to size
3220 * the ->node array in the rcu_state structure. 3322 * the ->node array in the rcu_state structure.
3221 */ 3323 */
3222static void __init rcu_init_geometry(void) 3324static void __init rcu_init_geometry(void)
@@ -3295,8 +3397,8 @@ void __init rcu_init(void)
3295 3397
3296 rcu_bootup_announce(); 3398 rcu_bootup_announce();
3297 rcu_init_geometry(); 3399 rcu_init_geometry();
3298 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3299 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3400 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3401 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3300 __rcu_init_preempt(); 3402 __rcu_init_preempt();
3301 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3403 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3302 3404
@@ -3311,4 +3413,4 @@ void __init rcu_init(void)
3311 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3413 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3312} 3414}
3313 3415
3314#include "rcutree_plugin.h" 3416#include "tree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h
index 5f97eab602cd..52be957c9fe2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcu/tree.h
@@ -104,6 +104,8 @@ struct rcu_dynticks {
104 /* idle-period nonlazy_posted snapshot. */ 104 /* idle-period nonlazy_posted snapshot. */
105 unsigned long last_accelerate; 105 unsigned long last_accelerate;
106 /* Last jiffy CBs were accelerated. */ 106 /* Last jiffy CBs were accelerated. */
107 unsigned long last_advance_all;
108 /* Last jiffy CBs were all advanced. */
107 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 109 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
108#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 110#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
109}; 111};
diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h
index 130c97b027f2..6abb03dff5c0 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -28,7 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include "time/tick-internal.h" 31#include "../time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
@@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void)
96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
97#ifdef CONFIG_RCU_NOCB_CPU_ALL 97#ifdef CONFIG_RCU_NOCB_CPU_ALL
98 pr_info("\tOffload RCU callbacks from all CPUs\n"); 98 pr_info("\tOffload RCU callbacks from all CPUs\n");
99 cpumask_setall(rcu_nocb_mask); 99 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
102 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
103 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
104 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
105 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
106 rcu_nocb_mask);
107 }
103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 108 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
104 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); 109 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
105 if (rcu_nocb_poll) 110 if (rcu_nocb_poll)
@@ -660,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu)
660 665
661static void rcu_preempt_do_callbacks(void) 666static void rcu_preempt_do_callbacks(void)
662{ 667{
663 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); 668 rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
664} 669}
665 670
666#endif /* #ifdef CONFIG_RCU_BOOST */ 671#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -1128,7 +1133,7 @@ void exit_rcu(void)
1128 1133
1129#ifdef CONFIG_RCU_BOOST 1134#ifdef CONFIG_RCU_BOOST
1130 1135
1131#include "rtmutex_common.h" 1136#include "../locking/rtmutex_common.h"
1132 1137
1133#ifdef CONFIG_RCU_TRACE 1138#ifdef CONFIG_RCU_TRACE
1134 1139
@@ -1332,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void)
1332 */ 1337 */
1333static bool rcu_is_callbacks_kthread(void) 1338static bool rcu_is_callbacks_kthread(void)
1334{ 1339{
1335 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1340 return __this_cpu_read(rcu_cpu_kthread_task) == current;
1336} 1341}
1337 1342
1338#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1382,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1382 1387
1383static void rcu_kthread_do_work(void) 1388static void rcu_kthread_do_work(void)
1384{ 1389{
1385 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1390 rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
1386 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1391 rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
1387 rcu_preempt_do_callbacks(); 1392 rcu_preempt_do_callbacks();
1388} 1393}
1389 1394
@@ -1402,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu)
1402 1407
1403static int rcu_cpu_kthread_should_run(unsigned int cpu) 1408static int rcu_cpu_kthread_should_run(unsigned int cpu)
1404{ 1409{
1405 return __get_cpu_var(rcu_cpu_has_work); 1410 return __this_cpu_read(rcu_cpu_has_work);
1406} 1411}
1407 1412
1408/* 1413/*
@@ -1412,8 +1417,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)
1412 */ 1417 */
1413static void rcu_cpu_kthread(unsigned int cpu) 1418static void rcu_cpu_kthread(unsigned int cpu)
1414{ 1419{
1415 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); 1420 unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
1416 char work, *workp = &__get_cpu_var(rcu_cpu_has_work); 1421 char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
1417 int spincnt; 1422 int spincnt;
1418 1423
1419 for (spincnt = 0; spincnt < 10; spincnt++) { 1424 for (spincnt = 0; spincnt < 10; spincnt++) {
@@ -1630,17 +1635,23 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644);
1630extern int tick_nohz_enabled; 1635extern int tick_nohz_enabled;
1631 1636
1632/* 1637/*
1633 * Try to advance callbacks for all flavors of RCU on the current CPU. 1638 * Try to advance callbacks for all flavors of RCU on the current CPU, but
1634 * Afterwards, if there are any callbacks ready for immediate invocation, 1639 * only if it has been awhile since the last time we did so. Afterwards,
1635 * return true. 1640 * if there are any callbacks ready for immediate invocation, return true.
1636 */ 1641 */
1637static bool rcu_try_advance_all_cbs(void) 1642static bool rcu_try_advance_all_cbs(void)
1638{ 1643{
1639 bool cbs_ready = false; 1644 bool cbs_ready = false;
1640 struct rcu_data *rdp; 1645 struct rcu_data *rdp;
1646 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1641 struct rcu_node *rnp; 1647 struct rcu_node *rnp;
1642 struct rcu_state *rsp; 1648 struct rcu_state *rsp;
1643 1649
1650 /* Exit early if we advanced recently. */
1651 if (jiffies == rdtp->last_advance_all)
1652 return 0;
1653 rdtp->last_advance_all = jiffies;
1654
1644 for_each_rcu_flavor(rsp) { 1655 for_each_rcu_flavor(rsp) {
1645 rdp = this_cpu_ptr(rsp->rda); 1656 rdp = this_cpu_ptr(rsp->rda);
1646 rnp = rdp->mynode; 1657 rnp = rdp->mynode;
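
The new last_advance_all stamp turns rcu_try_advance_all_cbs() into an at-most-once-per-jiffy operation: if it already ran during the current jiffy, it bails out before touching any callback lists. A small sketch of that jiffy-stamp rate limit with a fake tick counter:

#include <stdbool.h>
#include <stdio.h>

static unsigned long jiffies;		/* fake tick counter */
static unsigned long last_advance_all;	/* jiffy stamp of the last real run */

static bool try_advance_all_cbs(void)
{
	/* Exit early if we already advanced during this jiffy. */
	if (jiffies == last_advance_all)
		return false;
	last_advance_all = jiffies;

	puts("advancing callbacks");	/* the expensive part, at most once per tick */
	return true;
}

int main(void)
{
	jiffies = 100;
	try_advance_all_cbs();	/* does the work */
	try_advance_all_cbs();	/* skipped: same jiffy */
	jiffies++;
	try_advance_all_cbs();	/* does the work again */
	return 0;
}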
@@ -1739,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu)
1739 */ 1750 */
1740 if (rdtp->all_lazy && 1751 if (rdtp->all_lazy &&
1741 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { 1752 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1753 rdtp->all_lazy = false;
1754 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1742 invoke_rcu_core(); 1755 invoke_rcu_core();
1743 return; 1756 return;
1744 } 1757 }
@@ -1768,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu)
1768 */ 1781 */
1769static void rcu_cleanup_after_idle(int cpu) 1782static void rcu_cleanup_after_idle(int cpu)
1770{ 1783{
1771 struct rcu_data *rdp;
1772 struct rcu_state *rsp;
1773 1784
1774 if (rcu_is_nocb_cpu(cpu)) 1785 if (rcu_is_nocb_cpu(cpu))
1775 return; 1786 return;
1776 rcu_try_advance_all_cbs(); 1787 if (rcu_try_advance_all_cbs())
1777 for_each_rcu_flavor(rsp) { 1788 invoke_rcu_core();
1778 rdp = per_cpu_ptr(rsp->rda, cpu);
1779 if (cpu_has_callbacks_ready_to_invoke(rdp))
1780 invoke_rcu_core();
1781 }
1782} 1789}
1783 1790
1784/* 1791/*
@@ -2108,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2108 2115
2109 /* If we are not being polled and there is a kthread, awaken it ... */ 2116 /* If we are not being polled and there is a kthread, awaken it ... */
2110 t = ACCESS_ONCE(rdp->nocb_kthread); 2117 t = ACCESS_ONCE(rdp->nocb_kthread);
2111 if (rcu_nocb_poll | !t) 2118 if (rcu_nocb_poll || !t) {
2119 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2120 TPS("WakeNotPoll"));
2112 return; 2121 return;
2122 }
2113 len = atomic_long_read(&rdp->nocb_q_count); 2123 len = atomic_long_read(&rdp->nocb_q_count);
2114 if (old_rhpp == &rdp->nocb_head) { 2124 if (old_rhpp == &rdp->nocb_head) {
2115 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ 2125 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2116 rdp->qlen_last_fqs_check = 0; 2126 rdp->qlen_last_fqs_check = 0;
2127 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
2117 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2118 wake_up_process(t); /* ... or if many callbacks queued. */ 2129 wake_up_process(t); /* ... or if many callbacks queued. */
2119 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2130 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2131 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2132 } else {
2133 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2120 } 2134 }
2121 return; 2135 return;
2122} 2136}
@@ -2140,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2140 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2154 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2141 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2155 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2142 (unsigned long)rhp->func, 2156 (unsigned long)rhp->func,
2143 rdp->qlen_lazy, rdp->qlen); 2157 -atomic_long_read(&rdp->nocb_q_count_lazy),
2158 -atomic_long_read(&rdp->nocb_q_count));
2144 else 2159 else
2145 trace_rcu_callback(rdp->rsp->name, rhp, 2160 trace_rcu_callback(rdp->rsp->name, rhp,
2146 rdp->qlen_lazy, rdp->qlen); 2161 -atomic_long_read(&rdp->nocb_q_count_lazy),
2162 -atomic_long_read(&rdp->nocb_q_count));
2147 return 1; 2163 return 1;
2148} 2164}
2149 2165
@@ -2221,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2221static int rcu_nocb_kthread(void *arg) 2237static int rcu_nocb_kthread(void *arg)
2222{ 2238{
2223 int c, cl; 2239 int c, cl;
2240 bool firsttime = 1;
2224 struct rcu_head *list; 2241 struct rcu_head *list;
2225 struct rcu_head *next; 2242 struct rcu_head *next;
2226 struct rcu_head **tail; 2243 struct rcu_head **tail;
@@ -2229,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg)
2229 /* Each pass through this loop invokes one batch of callbacks */ 2246 /* Each pass through this loop invokes one batch of callbacks */
2230 for (;;) { 2247 for (;;) {
2231 /* If not polling, wait for next batch of callbacks. */ 2248 /* If not polling, wait for next batch of callbacks. */
2232 if (!rcu_nocb_poll) 2249 if (!rcu_nocb_poll) {
2250 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2251 TPS("Sleep"));
2233 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2252 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2253 } else if (firsttime) {
2254 firsttime = 0;
2255 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2256 TPS("Poll"));
2257 }
2234 list = ACCESS_ONCE(rdp->nocb_head); 2258 list = ACCESS_ONCE(rdp->nocb_head);
2235 if (!list) { 2259 if (!list) {
2260 if (!rcu_nocb_poll)
2261 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2262 TPS("WokeEmpty"));
2236 schedule_timeout_interruptible(1); 2263 schedule_timeout_interruptible(1);
2237 flush_signals(current); 2264 flush_signals(current);
2238 continue; 2265 continue;
2239 } 2266 }
2267 firsttime = 1;
2268 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2269 TPS("WokeNonEmpty"));
2240 2270
2241 /* 2271 /*
2242 * Extract queued callbacks, update counts, and wait 2272 * Extract queued callbacks, update counts, and wait
@@ -2257,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg)
2257 next = list->next; 2287 next = list->next;
2258 /* Wait for enqueuing to complete, if needed. */ 2288 /* Wait for enqueuing to complete, if needed. */
2259 while (next == NULL && &list->next != tail) { 2289 while (next == NULL && &list->next != tail) {
2290 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2291 TPS("WaitQueue"));
2260 schedule_timeout_interruptible(1); 2292 schedule_timeout_interruptible(1);
2293 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2294 TPS("WokeQueue"));
2261 next = list->next; 2295 next = list->next;
2262 } 2296 }
2263 debug_rcu_head_unqueue(list); 2297 debug_rcu_head_unqueue(list);
diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c
index cf6c17412932..3596797b7e46 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -44,7 +44,7 @@
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "tree.h"
48 48
49static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
50 const struct seq_operations *op) 50 const struct seq_operations *op)
diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c
index 33eb4620aa17..6cb3dff89e2b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcu/update.c
@@ -53,6 +53,12 @@
53 53
54#include "rcu.h" 54#include "rcu.h"
55 55
56MODULE_ALIAS("rcupdate");
57#ifdef MODULE_PARAM_PREFIX
58#undef MODULE_PARAM_PREFIX
59#endif
60#define MODULE_PARAM_PREFIX "rcupdate."
61
56module_param(rcu_expedited, int, 0); 62module_param(rcu_expedited, int, 0);
57 63
58#ifdef CONFIG_PREEMPT_RCU 64#ifdef CONFIG_PREEMPT_RCU
@@ -122,7 +128,7 @@ struct lockdep_map rcu_sched_lock_map =
122 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 128 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
123EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 129EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
124 130
125int debug_lockdep_rcu_enabled(void) 131int notrace debug_lockdep_rcu_enabled(void)
126{ 132{
127 return rcu_scheduler_active && debug_locks && 133 return rcu_scheduler_active && debug_locks &&
128 current->lockdep_recursion == 0; 134 current->lockdep_recursion == 0;
@@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void)
148{ 154{
149 if (!debug_lockdep_rcu_enabled()) 155 if (!debug_lockdep_rcu_enabled())
150 return 1; 156 return 1;
151 if (rcu_is_cpu_idle()) 157 if (!rcu_is_watching())
152 return 0; 158 return 0;
153 if (!rcu_lockdep_current_cpu_online()) 159 if (!rcu_lockdep_current_cpu_online())
154 return 0; 160 return 0;
@@ -298,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
298#endif 304#endif
299 305
300int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 306int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
301int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 307static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
302 308
303module_param(rcu_cpu_stall_suppress, int, 0644); 309module_param(rcu_cpu_stall_suppress, int, 0644);
304module_param(rcu_cpu_stall_timeout, int, 0644); 310module_param(rcu_cpu_stall_timeout, int, 0644);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 269ed9384cc4..f813b3474646 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid);
32#endif 32#endif
33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; 33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
34 34
35int reboot_default; 35/*
36 * This variable is used privately to keep track of whether or not
37 * reboot_type is still set to its default value (i.e., reboot= hasn't
38 * been set on the command line). This is needed so that we can
39 * suppress DMI scanning for reboot quirks. Without it, it's
40 * impossible to override a faulty reboot quirk without recompiling.
41 */
42int reboot_default = 1;
36int reboot_cpu; 43int reboot_cpu;
37enum reboot_type reboot_type = BOOT_ACPI; 44enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force; 45int reboot_force;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ff55247e7049..4aa8a305aede 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -17,8 +17,8 @@
17void res_counter_init(struct res_counter *counter, struct res_counter *parent) 17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{ 18{
19 spin_lock_init(&counter->lock); 19 spin_lock_init(&counter->lock);
20 counter->limit = RESOURCE_MAX; 20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RESOURCE_MAX; 21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
@@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
178#endif 178#endif
179 179
180int res_counter_memparse_write_strategy(const char *buf, 180int res_counter_memparse_write_strategy(const char *buf,
181 unsigned long long *res) 181 unsigned long long *resp)
182{ 182{
183 char *end; 183 char *end;
184 unsigned long long res;
184 185
185 /* return RESOURCE_MAX(unlimited) if "-1" is specified */ 186 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
186 if (*buf == '-') { 187 if (*buf == '-') {
187 *res = simple_strtoull(buf + 1, &end, 10); 188 res = simple_strtoull(buf + 1, &end, 10);
188 if (*res != 1 || *end != '\0') 189 if (res != 1 || *end != '\0')
189 return -EINVAL; 190 return -EINVAL;
190 *res = RESOURCE_MAX; 191 *resp = RES_COUNTER_MAX;
191 return 0; 192 return 0;
192 } 193 }
193 194
194 *res = memparse(buf, &end); 195 res = memparse(buf, &end);
195 if (*end != '\0') 196 if (*end != '\0')
196 return -EINVAL; 197 return -EINVAL;
197 198
198 *res = PAGE_ALIGN(*res); 199 if (PAGE_ALIGN(res) >= res)
200 res = PAGE_ALIGN(res);
201 else
202 res = RES_COUNTER_MAX;
203
204 *resp = res;
205
199 return 0; 206 return 0;
200} 207}
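
The res_counter change parses into a local variable and stores through the pointer only on success, and it clamps any value whose PAGE_ALIGN() would wrap past the top of the range to RES_COUNTER_MAX instead of silently rounding it down to a tiny number. A sketch of the wrap check follows; PAGE_SIZE, PAGE_ALIGN() and RES_COUNTER_MAX are defined here with assumed values for illustration only.

#include <stdio.h>

#define PAGE_SIZE	4096ULL
/* Usual round-up-to-page macro; it wraps for values near the top of the range. */
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define RES_COUNTER_MAX	(~0ULL)

static unsigned long long clamp_to_pages(unsigned long long res)
{
	/* If rounding up wrapped around, treat the request as "unlimited". */
	if (PAGE_ALIGN(res) >= res)
		return PAGE_ALIGN(res);
	return RES_COUNTER_MAX;
}

int main(void)
{
	unsigned long long sane = 123456;	/* rounds up to the next page boundary */
	unsigned long long huge = ~0ULL - 100;	/* PAGE_ALIGN() wraps past zero here */

	printf("%llu -> %llu\n", sane, clamp_to_pages(sane));
	printf("%llu -> %llu (clamped)\n", huge, clamp_to_pages(huge));
	return 0;
}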
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54adcf35f495..7b621409cf15 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-y += wait.o completion.o
15obj-$(CONFIG_SMP) += cpupri.o 16obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 17obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 18obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 000000000000..a63f4dc27909
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,299 @@
1/*
2 * Generic wait-for-completion handler;
3 *
4 * It differs from semaphores in that the default case is the opposite:
5 * wait_for_completion() blocks by default, whereas taking a semaphore normally does not. The
6 * interface also makes it easy to 'complete' multiple waiting threads,
7 * something which isn't entirely natural for semaphores.
8 *
9 * But more importantly, the primitive documents the usage. Semaphores are
10 * typically used for exclusion, which gives rise to priority inversion.
11 * Waiting for a completion is typically a synchronization point, not an exclusion point.
12 */
13
14#include <linux/sched.h>
15#include <linux/completion.h>
16
17/**
18 * complete: - signals a single thread waiting on this completion
19 * @x: holds the state of this particular completion
20 *
21 * This will wake up a single thread waiting on this completion. Threads will be
22 * awakened in the same order in which they were queued.
23 *
24 * See also complete_all(), wait_for_completion() and related routines.
25 *
26 * It may be assumed that this function implies a write memory barrier before
27 * changing the task state if and only if any tasks are woken up.
28 */
29void complete(struct completion *x)
30{
31 unsigned long flags;
32
33 spin_lock_irqsave(&x->wait.lock, flags);
34 x->done++;
35 __wake_up_locked(&x->wait, TASK_NORMAL, 1);
36 spin_unlock_irqrestore(&x->wait.lock, flags);
37}
38EXPORT_SYMBOL(complete);
39
40/**
41 * complete_all: - signals all threads waiting on this completion
42 * @x: holds the state of this particular completion
43 *
44 * This will wake up all threads waiting on this particular completion event.
45 *
46 * It may be assumed that this function implies a write memory barrier before
47 * changing the task state if and only if any tasks are woken up.
48 */
49void complete_all(struct completion *x)
50{
51 unsigned long flags;
52
53 spin_lock_irqsave(&x->wait.lock, flags);
54 x->done += UINT_MAX/2;
55 __wake_up_locked(&x->wait, TASK_NORMAL, 0);
56 spin_unlock_irqrestore(&x->wait.lock, flags);
57}
58EXPORT_SYMBOL(complete_all);
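complete_all() bumps ->done by UINT_MAX/2 so that every current and future waiter sees a non-zero count. A minimal sketch of the wake-everyone pattern (hypothetical names, not part of this file):

#include <linux/completion.h>

static DECLARE_COMPLETION(hw_ready);            /* hypothetical one-shot event */

static int consumer_fn(void *unused)
{
        wait_for_completion(&hw_ready);         /* every consumer blocks here */
        /* ... use the now-initialised hardware ... */
        return 0;
}

static void hw_init_finished(void)
{
        /* Wake all current waiters; later waiters won't block either. */
        complete_all(&hw_ready);
}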
59
60static inline long __sched
61do_wait_for_common(struct completion *x,
62 long (*action)(long), long timeout, int state)
63{
64 if (!x->done) {
65 DECLARE_WAITQUEUE(wait, current);
66
67 __add_wait_queue_tail_exclusive(&x->wait, &wait);
68 do {
69 if (signal_pending_state(state, current)) {
70 timeout = -ERESTARTSYS;
71 break;
72 }
73 __set_current_state(state);
74 spin_unlock_irq(&x->wait.lock);
75 timeout = action(timeout);
76 spin_lock_irq(&x->wait.lock);
77 } while (!x->done && timeout);
78 __remove_wait_queue(&x->wait, &wait);
79 if (!x->done)
80 return timeout;
81 }
82 x->done--;
83 return timeout ?: 1;
84}
85
86static inline long __sched
87__wait_for_common(struct completion *x,
88 long (*action)(long), long timeout, int state)
89{
90 might_sleep();
91
92 spin_lock_irq(&x->wait.lock);
93 timeout = do_wait_for_common(x, action, timeout, state);
94 spin_unlock_irq(&x->wait.lock);
95 return timeout;
96}
97
98static long __sched
99wait_for_common(struct completion *x, long timeout, int state)
100{
101 return __wait_for_common(x, schedule_timeout, timeout, state);
102}
103
104static long __sched
105wait_for_common_io(struct completion *x, long timeout, int state)
106{
107 return __wait_for_common(x, io_schedule_timeout, timeout, state);
108}
109
110/**
111 * wait_for_completion: - waits for completion of a task
112 * @x: holds the state of this particular completion
113 *
114 * This waits to be signaled for completion of a specific task. It is NOT
115 * interruptible and there is no timeout.
116 *
117 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
118 * and interrupt capability. Also see complete().
119 */
120void __sched wait_for_completion(struct completion *x)
121{
122 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
123}
124EXPORT_SYMBOL(wait_for_completion);
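A typical use is a submitter that fires off asynchronous work and then sleeps until the worker signals it. The sketch below uses a hypothetical request structure and a workqueue; it is an illustration of the API, not code from this patch.

#include <linux/kernel.h>
#include <linux/completion.h>
#include <linux/workqueue.h>

struct frob_request {                           /* hypothetical request */
        struct work_struct work;
        struct completion done;
};

static void frob_worker(struct work_struct *work)
{
        struct frob_request *req = container_of(work, struct frob_request, work);

        /* ... service the request ... */
        complete(&req->done);                   /* wake the submitter */
}

static void frob_submit_and_wait(struct frob_request *req)
{
        init_completion(&req->done);
        INIT_WORK(&req->work, frob_worker);
        schedule_work(&req->work);
        wait_for_completion(&req->done);        /* uninterruptible, no timeout */
}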
125
126/**
127 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
128 * @x: holds the state of this particular completion
129 * @timeout: timeout value in jiffies
130 *
131 * This waits for either a completion of a specific task to be signaled or for a
132 * specified timeout to expire. The timeout is in jiffies. It is not
133 * interruptible.
134 *
135 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
136 * till timeout) if completed.
137 */
138unsigned long __sched
139wait_for_completion_timeout(struct completion *x, unsigned long timeout)
140{
141 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
142}
143EXPORT_SYMBOL(wait_for_completion_timeout);
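Continuing the hypothetical frob_request sketch above, the timeout variant's return value distinguishes expiry (0) from completion (remaining jiffies, at least 1):

#include <linux/jiffies.h>
#include <linux/errno.h>

static int frob_submit_timed(struct frob_request *req)
{
        unsigned long left;

        init_completion(&req->done);
        INIT_WORK(&req->work, frob_worker);
        schedule_work(&req->work);

        left = wait_for_completion_timeout(&req->done, msecs_to_jiffies(500));
        if (!left)
                return -ETIMEDOUT;              /* 0: the 500 ms expired */
        return 0;                               /* >0: completed with 'left' jiffies to spare */
}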
144
145/**
146 * wait_for_completion_io: - waits for completion of a task
147 * @x: holds the state of this particular completion
148 *
149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO.
152 */
153void __sched wait_for_completion_io(struct completion *x)
154{
155 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
156}
157EXPORT_SYMBOL(wait_for_completion_io);
158
159/**
160 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
161 * @x: holds the state of this particular completion
162 * @timeout: timeout value in jiffies
163 *
164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO.
167 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed.
170 */
171unsigned long __sched
172wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
173{
174 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
175}
176EXPORT_SYMBOL(wait_for_completion_io_timeout);
177
178/**
179 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
180 * @x: holds the state of this particular completion
181 *
182 * This waits for completion of a specific task to be signaled. It is
183 * interruptible.
184 *
185 * Return: -ERESTARTSYS if interrupted, 0 if completed.
186 */
187int __sched wait_for_completion_interruptible(struct completion *x)
188{
189 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
190 if (t == -ERESTARTSYS)
191 return t;
192 return 0;
193}
194EXPORT_SYMBOL(wait_for_completion_interruptible);
195
196/**
197 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
198 * @x: holds the state of this particular completion
199 * @timeout: timeout value in jiffies
200 *
201 * This waits for either a completion of a specific task to be signaled or for a
202 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
203 *
204 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
205 * or number of jiffies left till timeout) if completed.
206 */
207long __sched
208wait_for_completion_interruptible_timeout(struct completion *x,
209 unsigned long timeout)
210{
211 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
212}
213EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
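The interruptible-with-timeout variant has a three-way return that callers need to handle explicitly. A sketch, again reusing the hypothetical frob_request:

static int frob_wait_intr(struct frob_request *req)
{
        long t;

        t = wait_for_completion_interruptible_timeout(&req->done,
                                                      msecs_to_jiffies(500));
        if (t == -ERESTARTSYS)
                return t;                       /* a signal arrived first */
        if (t == 0)
                return -ETIMEDOUT;              /* the timeout expired */
        return 0;                               /* completed; t jiffies were left */
}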
214
215/**
216 * wait_for_completion_killable: - waits for completion of a task (killable)
217 * @x: holds the state of this particular completion
218 *
219 * This waits to be signaled for completion of a specific task. It can be
220 * interrupted by a kill signal.
221 *
222 * Return: -ERESTARTSYS if interrupted, 0 if completed.
223 */
224int __sched wait_for_completion_killable(struct completion *x)
225{
226 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
227 if (t == -ERESTARTSYS)
228 return t;
229 return 0;
230}
231EXPORT_SYMBOL(wait_for_completion_killable);
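The killable variant is useful when arbitrary signals must not abort the wait but a fatal signal should. A short sketch on the same hypothetical request:

static int frob_wait_killable(struct frob_request *req)
{
        long err;

        err = wait_for_completion_killable(&req->done);
        if (err)                                /* only -ERESTARTSYS is possible here */
                return err;                     /* fatal signal: give up cleanly */
        return 0;
}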
232
233/**
234 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
235 * @x: holds the state of this particular completion
236 * @timeout: timeout value in jiffies
237 *
238 * This waits for either a completion of a specific task to be
239 * signaled or for a specified timeout to expire. It can be
240 * interrupted by a kill signal. The timeout is in jiffies.
241 *
242 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
243 * or number of jiffies left till timeout) if completed.
244 */
245long __sched
246wait_for_completion_killable_timeout(struct completion *x,
247 unsigned long timeout)
248{
249 return wait_for_common(x, timeout, TASK_KILLABLE);
250}
251EXPORT_SYMBOL(wait_for_completion_killable_timeout);
252
253/**
254 * try_wait_for_completion - try to decrement a completion without blocking
255 * @x: completion structure
256 *
257 * Return: 0 if a decrement cannot be done without blocking
258 * 1 if a decrement succeeded.
259 *
260 * If a completion is being used as a counting completion,
261 * attempt to decrement the counter without blocking. This
262 * enables us to avoid waiting if the resource the completion
263 * is protecting is not available.
264 */
265bool try_wait_for_completion(struct completion *x)
266{
267 unsigned long flags;
268 int ret = 1;
269
270 spin_lock_irqsave(&x->wait.lock, flags);
271 if (!x->done)
272 ret = 0;
273 else
274 x->done--;
275 spin_unlock_irqrestore(&x->wait.lock, flags);
276 return ret;
277}
278EXPORT_SYMBOL(try_wait_for_completion);
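When a completion is used as a counter, each complete() adds a "token" and try_wait_for_completion() consumes one without ever sleeping. A minimal sketch (hypothetical names):

#include <linux/completion.h>

static DECLARE_COMPLETION(token_pool);          /* hypothetical counting use */

static void put_token(void)
{
        complete(&token_pool);                  /* done++: one more token */
}

static bool get_token_nonblocking(void)
{
        /* Consume a token if one is available; never sleeps. */
        return try_wait_for_completion(&token_pool);
}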
279
280/**
281 * completion_done - Test to see if a completion has any waiters
282 * @x: completion structure
283 *
284 * Return: 0 if there are waiters (wait_for_completion() in progress)
285 * 1 if there are no waiters.
286 *
287 */
288bool completion_done(struct completion *x)
289{
290 unsigned long flags;
291 int ret = 1;
292
293 spin_lock_irqsave(&x->wait.lock, flags);
294 if (!x->done)
295 ret = 0;
296 spin_unlock_irqrestore(&x->wait.lock, flags);
297 return ret;
298}
299EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ac63c9a995a..c1808606ee5f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
513 * might also involve a cross-CPU call to trigger the scheduler on 513 * might also involve a cross-CPU call to trigger the scheduler on
514 * the target CPU. 514 * the target CPU.
515 */ 515 */
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p) 516void resched_task(struct task_struct *p)
518{ 517{
519 int cpu; 518 int cpu;
520 519
521 assert_raw_spin_locked(&task_rq(p)->lock); 520 lockdep_assert_held(&task_rq(p)->lock);
522 521
523 if (test_tsk_need_resched(p)) 522 if (test_tsk_need_resched(p))
524 return; 523 return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
526 set_tsk_need_resched(p); 525 set_tsk_need_resched(p);
527 526
528 cpu = task_cpu(p); 527 cpu = task_cpu(p);
529 if (cpu == smp_processor_id()) 528 if (cpu == smp_processor_id()) {
529 set_preempt_need_resched();
530 return; 530 return;
531 }
531 532
532 /* NEED_RESCHED must be visible before we test polling */ 533 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 534 smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 547 raw_spin_unlock_irqrestore(&rq->lock, flags);
547} 548}
548 549
550#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON 551#ifdef CONFIG_NO_HZ_COMMON
550/* 552/*
551 * In the semi idle case, use the nearest busy cpu for migrating timers 553 * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
693 } 695 }
694} 696}
695 697
696#else /* !CONFIG_SMP */
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif /* CONFIG_SMP */ 698#endif /* CONFIG_SMP */
703 699
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 700#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 763static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{ 764{
769 update_rq_clock(rq); 765 update_rq_clock(rq);
770 sched_info_queued(p); 766 sched_info_queued(rq, p);
771 p->sched_class->enqueue_task(rq, p, flags); 767 p->sched_class->enqueue_task(rq, p, flags);
772} 768}
773 769
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 770static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{ 771{
776 update_rq_clock(rq); 772 update_rq_clock(rq);
777 sched_info_dequeued(p); 773 sched_info_dequeued(rq, p);
778 p->sched_class->dequeue_task(rq, p, flags); 774 p->sched_class->dequeue_task(rq, p, flags);
779} 775}
780 776
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
987 * ttwu() will sort out the placement. 983 * ttwu() will sort out the placement.
988 */ 984 */
989 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
990 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 986 !(task_preempt_count(p) & PREEMPT_ACTIVE));
991 987
992#ifdef CONFIG_LOCKDEP 988#ifdef CONFIG_LOCKDEP
993 /* 989 /*
@@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1017 __set_task_cpu(p, new_cpu); 1013 __set_task_cpu(p, new_cpu);
1018} 1014}
1019 1015
1016static void __migrate_swap_task(struct task_struct *p, int cpu)
1017{
1018 if (p->on_rq) {
1019 struct rq *src_rq, *dst_rq;
1020
1021 src_rq = task_rq(p);
1022 dst_rq = cpu_rq(cpu);
1023
1024 deactivate_task(src_rq, p, 0);
1025 set_task_cpu(p, cpu);
1026 activate_task(dst_rq, p, 0);
1027 check_preempt_curr(dst_rq, p, 0);
1028 } else {
1029 /*
1030 * Task isn't running anymore; make it appear like we migrated
1031 * it before it went to sleep. This means on wakeup we make the
1032 * previous cpu our target instead of where it really is.
1033 */
1034 p->wake_cpu = cpu;
1035 }
1036}
1037
1038struct migration_swap_arg {
1039 struct task_struct *src_task, *dst_task;
1040 int src_cpu, dst_cpu;
1041};
1042
1043static int migrate_swap_stop(void *data)
1044{
1045 struct migration_swap_arg *arg = data;
1046 struct rq *src_rq, *dst_rq;
1047 int ret = -EAGAIN;
1048
1049 src_rq = cpu_rq(arg->src_cpu);
1050 dst_rq = cpu_rq(arg->dst_cpu);
1051
1052 double_raw_lock(&arg->src_task->pi_lock,
1053 &arg->dst_task->pi_lock);
1054 double_rq_lock(src_rq, dst_rq);
1055 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1056 goto unlock;
1057
1058 if (task_cpu(arg->src_task) != arg->src_cpu)
1059 goto unlock;
1060
1061 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1062 goto unlock;
1063
1064 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1065 goto unlock;
1066
1067 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1068 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1069
1070 ret = 0;
1071
1072unlock:
1073 double_rq_unlock(src_rq, dst_rq);
1074 raw_spin_unlock(&arg->dst_task->pi_lock);
1075 raw_spin_unlock(&arg->src_task->pi_lock);
1076
1077 return ret;
1078}
1079
1080/*
1081 * Cross migrate two tasks
1082 */
1083int migrate_swap(struct task_struct *cur, struct task_struct *p)
1084{
1085 struct migration_swap_arg arg;
1086 int ret = -EINVAL;
1087
1088 arg = (struct migration_swap_arg){
1089 .src_task = cur,
1090 .src_cpu = task_cpu(cur),
1091 .dst_task = p,
1092 .dst_cpu = task_cpu(p),
1093 };
1094
1095 if (arg.src_cpu == arg.dst_cpu)
1096 goto out;
1097
1098 /*
1099 * These three tests are all lockless; this is OK since all of them
1100 * will be re-checked with proper locks held further down the line.
1101 */
1102 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1103 goto out;
1104
1105 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1106 goto out;
1107
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out;
1110
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112
1113out:
1114 return ret;
1115}
1116
1020struct migration_arg { 1117struct migration_arg {
1021 struct task_struct *task; 1118 struct task_struct *task;
1022 int dest_cpu; 1119 int dest_cpu;
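The shape of migrate_swap() above — cheap unlocked checks first, then the same checks repeated in migrate_swap_stop() with both pi_locks and both runqueue locks held — is the usual optimistic-check idiom. A generic sketch of that idiom (hypothetical names, not from this patch):

#include <linux/spinlock.h>

static bool still_eligible(void);               /* hypothetical predicate */

static void optimistic_update(spinlock_t *lock)
{
        if (!still_eligible())                  /* cheap, racy early-out */
                return;

        spin_lock(lock);
        if (still_eligible()) {
                /* authoritative re-check passed: perform the update */
        }
        spin_unlock(lock);
}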
@@ -1236,9 +1333,9 @@ out:
1236 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1333 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1237 */ 1334 */
1238static inline 1335static inline
1239int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1336int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1240{ 1337{
1241 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1338 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1242 1339
1243 /* 1340 /*
1244 * In order not to call set_task_cpu() on a blocking task we need 1341 * In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1330 1427
1331 if (rq->idle_stamp) { 1428 if (rq->idle_stamp) {
1332 u64 delta = rq_clock(rq) - rq->idle_stamp; 1429 u64 delta = rq_clock(rq) - rq->idle_stamp;
1333 u64 max = 2*sysctl_sched_migration_cost; 1430 u64 max = 2*rq->max_idle_balance_cost;
1334 1431
1335 if (delta > max) 1432 update_avg(&rq->avg_idle, delta);
1433
1434 if (rq->avg_idle > max)
1336 rq->avg_idle = max; 1435 rq->avg_idle = max;
1337 else 1436
1338 update_avg(&rq->avg_idle, delta);
1339 rq->idle_stamp = 0; 1437 rq->idle_stamp = 0;
1340 } 1438 }
1341#endif 1439#endif
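The reordered wakeup path now always folds the observed idle period into rq->avg_idle and only afterwards clamps the average to 2*max_idle_balance_cost, instead of clamping the raw sample before averaging. A sketch of the effect, assuming update_avg() is a simple 1/8-weight exponential average (its body is not part of this hunk, and struct rq_sketch below is a hypothetical stand-in):

#include <linux/types.h>

struct rq_sketch {                              /* stand-in for struct rq */
        u64 avg_idle;
        u64 max_idle_balance_cost;
};

static inline void update_avg_sketch(u64 *avg, u64 sample)
{
        s64 diff = sample - *avg;
        *avg += diff >> 3;                      /* new sample gets ~1/8 weight */
}

static void note_idle_period(struct rq_sketch *rq, u64 delta)
{
        u64 max = 2 * rq->max_idle_balance_cost;

        update_avg_sketch(&rq->avg_idle, delta); /* always account the sample */
        if (rq->avg_idle > max)                  /* then clamp the average */
                rq->avg_idle = max;
}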
@@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void)
1396 1494
1397void scheduler_ipi(void) 1495void scheduler_ipi(void)
1398{ 1496{
1497 /*
1498 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI.
1501 */
1502 if (tif_need_resched())
1503 set_preempt_need_resched();
1504
1399 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1400 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
1401 && !got_nohz_idle_kick()) 1507 && !got_nohz_idle_kick())
@@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1513 if (p->sched_class->task_waking) 1619 if (p->sched_class->task_waking)
1514 p->sched_class->task_waking(p); 1620 p->sched_class->task_waking(p);
1515 1621
1516 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1622 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1517 if (task_cpu(p) != cpu) { 1623 if (task_cpu(p) != cpu) {
1518 wake_flags |= WF_MIGRATED; 1624 wake_flags |= WF_MIGRATED;
1519 set_task_cpu(p, cpu); 1625 set_task_cpu(p, cpu);
@@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1595 * 1701 *
1596 * __sched_fork() is basic setup used by init_idle() too: 1702 * __sched_fork() is basic setup used by init_idle() too:
1597 */ 1703 */
1598static void __sched_fork(struct task_struct *p) 1704static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1599{ 1705{
1600 p->on_rq = 0; 1706 p->on_rq = 0;
1601 1707
@@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
1619 1725
1620#ifdef CONFIG_NUMA_BALANCING 1726#ifdef CONFIG_NUMA_BALANCING
1621 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1727 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1622 p->mm->numa_next_scan = jiffies; 1728 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1623 p->mm->numa_next_reset = jiffies;
1624 p->mm->numa_scan_seq = 0; 1729 p->mm->numa_scan_seq = 0;
1625 } 1730 }
1626 1731
1732 if (clone_flags & CLONE_VM)
1733 p->numa_preferred_nid = current->numa_preferred_nid;
1734 else
1735 p->numa_preferred_nid = -1;
1736
1627 p->node_stamp = 0ULL; 1737 p->node_stamp = 0ULL;
1628 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1738 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1629 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1630 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1739 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1631 p->numa_work.next = &p->numa_work; 1740 p->numa_work.next = &p->numa_work;
1741 p->numa_faults = NULL;
1742 p->numa_faults_buffer = NULL;
1743
1744 INIT_LIST_HEAD(&p->numa_entry);
1745 p->numa_group = NULL;
1632#endif /* CONFIG_NUMA_BALANCING */ 1746#endif /* CONFIG_NUMA_BALANCING */
1633} 1747}
1634 1748
@@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
1654/* 1768/*
1655 * fork()/clone()-time setup: 1769 * fork()/clone()-time setup:
1656 */ 1770 */
1657void sched_fork(struct task_struct *p) 1771void sched_fork(unsigned long clone_flags, struct task_struct *p)
1658{ 1772{
1659 unsigned long flags; 1773 unsigned long flags;
1660 int cpu = get_cpu(); 1774 int cpu = get_cpu();
1661 1775
1662 __sched_fork(p); 1776 __sched_fork(clone_flags, p);
1663 /* 1777 /*
1664 * We mark the process as running here. This guarantees that 1778 * We mark the process as running here. This guarantees that
1665 * nobody will actually run it, and a signal or other external 1779 * nobody will actually run it, and a signal or other external
@@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p)
1717#if defined(CONFIG_SMP) 1831#if defined(CONFIG_SMP)
1718 p->on_cpu = 0; 1832 p->on_cpu = 0;
1719#endif 1833#endif
1720#ifdef CONFIG_PREEMPT_COUNT 1834 init_task_preempt_count(p);
1721 /* Want to start with kernel preemption disabled. */
1722 task_thread_info(p)->preempt_count = 1;
1723#endif
1724#ifdef CONFIG_SMP 1835#ifdef CONFIG_SMP
1725 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1836 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1726#endif 1837#endif
@@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
1747 * - cpus_allowed can change in the fork path 1858 * - cpus_allowed can change in the fork path
1748 * - any previously selected cpu might disappear through hotplug 1859 * - any previously selected cpu might disappear through hotplug
1749 */ 1860 */
1750 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1861 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
1751#endif 1862#endif
1752 1863
1753 /* Initialize new task's runnable average */ 1864 /* Initialize new task's runnable average */
@@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1838 struct task_struct *next) 1949 struct task_struct *next)
1839{ 1950{
1840 trace_sched_switch(prev, next); 1951 trace_sched_switch(prev, next);
1841 sched_info_switch(prev, next); 1952 sched_info_switch(rq, prev, next);
1842 perf_event_task_sched_out(prev, next); 1953 perf_event_task_sched_out(prev, next);
1843 fire_sched_out_preempt_notifiers(prev, next); 1954 fire_sched_out_preempt_notifiers(prev, next);
1844 prepare_lock_switch(rq, next); 1955 prepare_lock_switch(rq, next);
@@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1890 if (mm) 2001 if (mm)
1891 mmdrop(mm); 2002 mmdrop(mm);
1892 if (unlikely(prev_state == TASK_DEAD)) { 2003 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev);
2005
1893 /* 2006 /*
1894 * Remove function-return probe instances associated with this 2007 * Remove function-return probe instances associated with this
1895 * task and put them back on the free list. 2008 * task and put them back on the free list.
@@ -2073,7 +2186,7 @@ void sched_exec(void)
2073 int dest_cpu; 2186 int dest_cpu;
2074 2187
2075 raw_spin_lock_irqsave(&p->pi_lock, flags); 2188 raw_spin_lock_irqsave(&p->pi_lock, flags);
2076 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2189 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2077 if (dest_cpu == smp_processor_id()) 2190 if (dest_cpu == smp_processor_id())
2078 goto unlock; 2191 goto unlock;
2079 2192
@@ -2140,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2140 struct rq *rq; 2253 struct rq *rq;
2141 u64 ns = 0; 2254 u64 ns = 0;
2142 2255
2256#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2257 /*
2258 * 64-bit doesn't need locks to atomically read a 64-bit value.
2259 * So we have an optimization chance when the task's delta_exec is 0.
2260 * Reading ->on_cpu is racy, but this is ok.
2261 *
2262 * If we race with it leaving cpu, we'll take a lock. So we're correct.
2263 * If we race with it entering cpu, unaccounted time is 0. This is
2264 * indistinguishable from the read occurring a few cycles earlier.
2265 */
2266 if (!p->on_cpu)
2267 return p->se.sum_exec_runtime;
2268#endif
2269
2143 rq = task_rq_lock(p, &flags); 2270 rq = task_rq_lock(p, &flags);
2144 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2271 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2145 task_rq_unlock(rq, p, &flags); 2272 task_rq_unlock(rq, p, &flags);
@@ -2215,7 +2342,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2215#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2342#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2216 defined(CONFIG_PREEMPT_TRACER)) 2343 defined(CONFIG_PREEMPT_TRACER))
2217 2344
2218void __kprobes add_preempt_count(int val) 2345void __kprobes preempt_count_add(int val)
2219{ 2346{
2220#ifdef CONFIG_DEBUG_PREEMPT 2347#ifdef CONFIG_DEBUG_PREEMPT
2221 /* 2348 /*
@@ -2224,7 +2351,7 @@ void __kprobes add_preempt_count(int val)
2224 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2351 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2225 return; 2352 return;
2226#endif 2353#endif
2227 preempt_count() += val; 2354 __preempt_count_add(val);
2228#ifdef CONFIG_DEBUG_PREEMPT 2355#ifdef CONFIG_DEBUG_PREEMPT
2229 /* 2356 /*
2230 * Spinlock count overflowing soon? 2357 * Spinlock count overflowing soon?
@@ -2235,9 +2362,9 @@ void __kprobes add_preempt_count(int val)
2235 if (preempt_count() == val) 2362 if (preempt_count() == val)
2236 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2363 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2237} 2364}
2238EXPORT_SYMBOL(add_preempt_count); 2365EXPORT_SYMBOL(preempt_count_add);
2239 2366
2240void __kprobes sub_preempt_count(int val) 2367void __kprobes preempt_count_sub(int val)
2241{ 2368{
2242#ifdef CONFIG_DEBUG_PREEMPT 2369#ifdef CONFIG_DEBUG_PREEMPT
2243 /* 2370 /*
@@ -2255,9 +2382,9 @@ void __kprobes sub_preempt_count(int val)
2255 2382
2256 if (preempt_count() == val) 2383 if (preempt_count() == val)
2257 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2384 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2258 preempt_count() -= val; 2385 __preempt_count_sub(val);
2259} 2386}
2260EXPORT_SYMBOL(sub_preempt_count); 2387EXPORT_SYMBOL(preempt_count_sub);
2261 2388
2262#endif 2389#endif
2263 2390
@@ -2430,6 +2557,7 @@ need_resched:
2430 put_prev_task(rq, prev); 2557 put_prev_task(rq, prev);
2431 next = pick_next_task(rq); 2558 next = pick_next_task(rq);
2432 clear_tsk_need_resched(prev); 2559 clear_tsk_need_resched(prev);
2560 clear_preempt_need_resched();
2433 rq->skip_clock_update = 0; 2561 rq->skip_clock_update = 0;
2434 2562
2435 if (likely(prev != next)) { 2563 if (likely(prev != next)) {
@@ -2520,9 +2648,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
2520 return; 2648 return;
2521 2649
2522 do { 2650 do {
2523 add_preempt_count_notrace(PREEMPT_ACTIVE); 2651 __preempt_count_add(PREEMPT_ACTIVE);
2524 __schedule(); 2652 __schedule();
2525 sub_preempt_count_notrace(PREEMPT_ACTIVE); 2653 __preempt_count_sub(PREEMPT_ACTIVE);
2526 2654
2527 /* 2655 /*
2528 * Check again in case we missed a preemption opportunity 2656 * Check again in case we missed a preemption opportunity
@@ -2541,20 +2669,19 @@ EXPORT_SYMBOL(preempt_schedule);
2541 */ 2669 */
2542asmlinkage void __sched preempt_schedule_irq(void) 2670asmlinkage void __sched preempt_schedule_irq(void)
2543{ 2671{
2544 struct thread_info *ti = current_thread_info();
2545 enum ctx_state prev_state; 2672 enum ctx_state prev_state;
2546 2673
2547 /* Catch callers which need to be fixed */ 2674 /* Catch callers which need to be fixed */
2548 BUG_ON(ti->preempt_count || !irqs_disabled()); 2675 BUG_ON(preempt_count() || !irqs_disabled());
2549 2676
2550 prev_state = exception_enter(); 2677 prev_state = exception_enter();
2551 2678
2552 do { 2679 do {
2553 add_preempt_count(PREEMPT_ACTIVE); 2680 __preempt_count_add(PREEMPT_ACTIVE);
2554 local_irq_enable(); 2681 local_irq_enable();
2555 __schedule(); 2682 __schedule();
2556 local_irq_disable(); 2683 local_irq_disable();
2557 sub_preempt_count(PREEMPT_ACTIVE); 2684 __preempt_count_sub(PREEMPT_ACTIVE);
2558 2685
2559 /* 2686 /*
2560 * Check again in case we missed a preemption opportunity 2687 * Check again in case we missed a preemption opportunity
@@ -2575,393 +2702,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2575} 2702}
2576EXPORT_SYMBOL(default_wake_function); 2703EXPORT_SYMBOL(default_wake_function);
2577 2704
2578/*
2579 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
2580 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
2581 * number) then we wake all the non-exclusive tasks and one exclusive task.
2582 *
2583 * There are circumstances in which we can try to wake a task which has already
2584 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
2585 * zero in this (rare) case, and we handle it by continuing to scan the queue.
2586 */
2587static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2588 int nr_exclusive, int wake_flags, void *key)
2589{
2590 wait_queue_t *curr, *next;
2591
2592 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
2593 unsigned flags = curr->flags;
2594
2595 if (curr->func(curr, mode, wake_flags, key) &&
2596 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
2597 break;
2598 }
2599}
2600
2601/**
2602 * __wake_up - wake up threads blocked on a waitqueue.
2603 * @q: the waitqueue
2604 * @mode: which threads
2605 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2606 * @key: is directly passed to the wakeup function
2607 *
2608 * It may be assumed that this function implies a write memory barrier before
2609 * changing the task state if and only if any tasks are woken up.
2610 */
2611void __wake_up(wait_queue_head_t *q, unsigned int mode,
2612 int nr_exclusive, void *key)
2613{
2614 unsigned long flags;
2615
2616 spin_lock_irqsave(&q->lock, flags);
2617 __wake_up_common(q, mode, nr_exclusive, 0, key);
2618 spin_unlock_irqrestore(&q->lock, flags);
2619}
2620EXPORT_SYMBOL(__wake_up);
2621
2622/*
2623 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2624 */
2625void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
2626{
2627 __wake_up_common(q, mode, nr, 0, NULL);
2628}
2629EXPORT_SYMBOL_GPL(__wake_up_locked);
2630
2631void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
2632{
2633 __wake_up_common(q, mode, 1, 0, key);
2634}
2635EXPORT_SYMBOL_GPL(__wake_up_locked_key);
2636
2637/**
2638 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
2639 * @q: the waitqueue
2640 * @mode: which threads
2641 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2642 * @key: opaque value to be passed to wakeup targets
2643 *
2644 * The sync wakeup differs that the waker knows that it will schedule
2645 * away soon, so while the target thread will be woken up, it will not
2646 * be migrated to another CPU - ie. the two threads are 'synchronized'
2647 * with each other. This can prevent needless bouncing between CPUs.
2648 *
2649 * On UP it can prevent extra preemption.
2650 *
2651 * It may be assumed that this function implies a write memory barrier before
2652 * changing the task state if and only if any tasks are woken up.
2653 */
2654void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2655 int nr_exclusive, void *key)
2656{
2657 unsigned long flags;
2658 int wake_flags = WF_SYNC;
2659
2660 if (unlikely(!q))
2661 return;
2662
2663 if (unlikely(nr_exclusive != 1))
2664 wake_flags = 0;
2665
2666 spin_lock_irqsave(&q->lock, flags);
2667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
2668 spin_unlock_irqrestore(&q->lock, flags);
2669}
2670EXPORT_SYMBOL_GPL(__wake_up_sync_key);
2671
2672/*
2673 * __wake_up_sync - see __wake_up_sync_key()
2674 */
2675void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2676{
2677 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
2678}
2679EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
2680
2681/**
2682 * complete: - signals a single thread waiting on this completion
2683 * @x: holds the state of this particular completion
2684 *
2685 * This will wake up a single thread waiting on this completion. Threads will be
2686 * awakened in the same order in which they were queued.
2687 *
2688 * See also complete_all(), wait_for_completion() and related routines.
2689 *
2690 * It may be assumed that this function implies a write memory barrier before
2691 * changing the task state if and only if any tasks are woken up.
2692 */
2693void complete(struct completion *x)
2694{
2695 unsigned long flags;
2696
2697 spin_lock_irqsave(&x->wait.lock, flags);
2698 x->done++;
2699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
2700 spin_unlock_irqrestore(&x->wait.lock, flags);
2701}
2702EXPORT_SYMBOL(complete);
2703
2704/**
2705 * complete_all: - signals all threads waiting on this completion
2706 * @x: holds the state of this particular completion
2707 *
2708 * This will wake up all threads waiting on this particular completion event.
2709 *
2710 * It may be assumed that this function implies a write memory barrier before
2711 * changing the task state if and only if any tasks are woken up.
2712 */
2713void complete_all(struct completion *x)
2714{
2715 unsigned long flags;
2716
2717 spin_lock_irqsave(&x->wait.lock, flags);
2718 x->done += UINT_MAX/2;
2719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
2720 spin_unlock_irqrestore(&x->wait.lock, flags);
2721}
2722EXPORT_SYMBOL(complete_all);
2723
2724static inline long __sched
2725do_wait_for_common(struct completion *x,
2726 long (*action)(long), long timeout, int state)
2727{
2728 if (!x->done) {
2729 DECLARE_WAITQUEUE(wait, current);
2730
2731 __add_wait_queue_tail_exclusive(&x->wait, &wait);
2732 do {
2733 if (signal_pending_state(state, current)) {
2734 timeout = -ERESTARTSYS;
2735 break;
2736 }
2737 __set_current_state(state);
2738 spin_unlock_irq(&x->wait.lock);
2739 timeout = action(timeout);
2740 spin_lock_irq(&x->wait.lock);
2741 } while (!x->done && timeout);
2742 __remove_wait_queue(&x->wait, &wait);
2743 if (!x->done)
2744 return timeout;
2745 }
2746 x->done--;
2747 return timeout ?: 1;
2748}
2749
2750static inline long __sched
2751__wait_for_common(struct completion *x,
2752 long (*action)(long), long timeout, int state)
2753{
2754 might_sleep();
2755
2756 spin_lock_irq(&x->wait.lock);
2757 timeout = do_wait_for_common(x, action, timeout, state);
2758 spin_unlock_irq(&x->wait.lock);
2759 return timeout;
2760}
2761
2762static long __sched
2763wait_for_common(struct completion *x, long timeout, int state)
2764{
2765 return __wait_for_common(x, schedule_timeout, timeout, state);
2766}
2767
2768static long __sched
2769wait_for_common_io(struct completion *x, long timeout, int state)
2770{
2771 return __wait_for_common(x, io_schedule_timeout, timeout, state);
2772}
2773
2774/**
2775 * wait_for_completion: - waits for completion of a task
2776 * @x: holds the state of this particular completion
2777 *
2778 * This waits to be signaled for completion of a specific task. It is NOT
2779 * interruptible and there is no timeout.
2780 *
2781 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
2782 * and interrupt capability. Also see complete().
2783 */
2784void __sched wait_for_completion(struct completion *x)
2785{
2786 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2787}
2788EXPORT_SYMBOL(wait_for_completion);
2789
2790/**
2791 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
2792 * @x: holds the state of this particular completion
2793 * @timeout: timeout value in jiffies
2794 *
2795 * This waits for either a completion of a specific task to be signaled or for a
2796 * specified timeout to expire. The timeout is in jiffies. It is not
2797 * interruptible.
2798 *
2799 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
2800 * till timeout) if completed.
2801 */
2802unsigned long __sched
2803wait_for_completion_timeout(struct completion *x, unsigned long timeout)
2804{
2805 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
2806}
2807EXPORT_SYMBOL(wait_for_completion_timeout);
2808
2809/**
2810 * wait_for_completion_io: - waits for completion of a task
2811 * @x: holds the state of this particular completion
2812 *
2813 * This waits to be signaled for completion of a specific task. It is NOT
2814 * interruptible and there is no timeout. The caller is accounted as waiting
2815 * for IO.
2816 */
2817void __sched wait_for_completion_io(struct completion *x)
2818{
2819 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2820}
2821EXPORT_SYMBOL(wait_for_completion_io);
2822
2823/**
2824 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
2825 * @x: holds the state of this particular completion
2826 * @timeout: timeout value in jiffies
2827 *
2828 * This waits for either a completion of a specific task to be signaled or for a
2829 * specified timeout to expire. The timeout is in jiffies. It is not
2830 * interruptible. The caller is accounted as waiting for IO.
2831 *
2832 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
2833 * till timeout) if completed.
2834 */
2835unsigned long __sched
2836wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
2837{
2838 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
2839}
2840EXPORT_SYMBOL(wait_for_completion_io_timeout);
2841
2842/**
2843 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
2844 * @x: holds the state of this particular completion
2845 *
2846 * This waits for completion of a specific task to be signaled. It is
2847 * interruptible.
2848 *
2849 * Return: -ERESTARTSYS if interrupted, 0 if completed.
2850 */
2851int __sched wait_for_completion_interruptible(struct completion *x)
2852{
2853 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
2854 if (t == -ERESTARTSYS)
2855 return t;
2856 return 0;
2857}
2858EXPORT_SYMBOL(wait_for_completion_interruptible);
2859
2860/**
2861 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
2862 * @x: holds the state of this particular completion
2863 * @timeout: timeout value in jiffies
2864 *
2865 * This waits for either a completion of a specific task to be signaled or for a
2866 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
2867 *
2868 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
2869 * or number of jiffies left till timeout) if completed.
2870 */
2871long __sched
2872wait_for_completion_interruptible_timeout(struct completion *x,
2873 unsigned long timeout)
2874{
2875 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
2876}
2877EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
2878
2879/**
2880 * wait_for_completion_killable: - waits for completion of a task (killable)
2881 * @x: holds the state of this particular completion
2882 *
2883 * This waits to be signaled for completion of a specific task. It can be
2884 * interrupted by a kill signal.
2885 *
2886 * Return: -ERESTARTSYS if interrupted, 0 if completed.
2887 */
2888int __sched wait_for_completion_killable(struct completion *x)
2889{
2890 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
2891 if (t == -ERESTARTSYS)
2892 return t;
2893 return 0;
2894}
2895EXPORT_SYMBOL(wait_for_completion_killable);
2896
2897/**
2898 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
2899 * @x: holds the state of this particular completion
2900 * @timeout: timeout value in jiffies
2901 *
2902 * This waits for either a completion of a specific task to be
2903 * signaled or for a specified timeout to expire. It can be
2904 * interrupted by a kill signal. The timeout is in jiffies.
2905 *
2906 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
2907 * or number of jiffies left till timeout) if completed.
2908 */
2909long __sched
2910wait_for_completion_killable_timeout(struct completion *x,
2911 unsigned long timeout)
2912{
2913 return wait_for_common(x, timeout, TASK_KILLABLE);
2914}
2915EXPORT_SYMBOL(wait_for_completion_killable_timeout);
2916
2917/**
2918 * try_wait_for_completion - try to decrement a completion without blocking
2919 * @x: completion structure
2920 *
2921 * Return: 0 if a decrement cannot be done without blocking
2922 * 1 if a decrement succeeded.
2923 *
2924 * If a completion is being used as a counting completion,
2925 * attempt to decrement the counter without blocking. This
2926 * enables us to avoid waiting if the resource the completion
2927 * is protecting is not available.
2928 */
2929bool try_wait_for_completion(struct completion *x)
2930{
2931 unsigned long flags;
2932 int ret = 1;
2933
2934 spin_lock_irqsave(&x->wait.lock, flags);
2935 if (!x->done)
2936 ret = 0;
2937 else
2938 x->done--;
2939 spin_unlock_irqrestore(&x->wait.lock, flags);
2940 return ret;
2941}
2942EXPORT_SYMBOL(try_wait_for_completion);
2943
2944/**
2945 * completion_done - Test to see if a completion has any waiters
2946 * @x: completion structure
2947 *
2948 * Return: 0 if there are waiters (wait_for_completion() in progress)
2949 * 1 if there are no waiters.
2950 *
2951 */
2952bool completion_done(struct completion *x)
2953{
2954 unsigned long flags;
2955 int ret = 1;
2956
2957 spin_lock_irqsave(&x->wait.lock, flags);
2958 if (!x->done)
2959 ret = 0;
2960 spin_unlock_irqrestore(&x->wait.lock, flags);
2961 return ret;
2962}
2963EXPORT_SYMBOL(completion_done);
2964
2965static long __sched 2705static long __sched
2966sleep_on_common(wait_queue_head_t *q, int state, long timeout) 2706sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2967{ 2707{
@@ -3598,13 +3338,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3598 struct task_struct *p; 3338 struct task_struct *p;
3599 int retval; 3339 int retval;
3600 3340
3601 get_online_cpus();
3602 rcu_read_lock(); 3341 rcu_read_lock();
3603 3342
3604 p = find_process_by_pid(pid); 3343 p = find_process_by_pid(pid);
3605 if (!p) { 3344 if (!p) {
3606 rcu_read_unlock(); 3345 rcu_read_unlock();
3607 put_online_cpus();
3608 return -ESRCH; 3346 return -ESRCH;
3609 } 3347 }
3610 3348
@@ -3661,7 +3399,6 @@ out_free_cpus_allowed:
3661 free_cpumask_var(cpus_allowed); 3399 free_cpumask_var(cpus_allowed);
3662out_put_task: 3400out_put_task:
3663 put_task_struct(p); 3401 put_task_struct(p);
3664 put_online_cpus();
3665 return retval; 3402 return retval;
3666} 3403}
3667 3404
@@ -3706,7 +3443,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3706 unsigned long flags; 3443 unsigned long flags;
3707 int retval; 3444 int retval;
3708 3445
3709 get_online_cpus();
3710 rcu_read_lock(); 3446 rcu_read_lock();
3711 3447
3712 retval = -ESRCH; 3448 retval = -ESRCH;
@@ -3719,12 +3455,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3719 goto out_unlock; 3455 goto out_unlock;
3720 3456
3721 raw_spin_lock_irqsave(&p->pi_lock, flags); 3457 raw_spin_lock_irqsave(&p->pi_lock, flags);
3722 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 3458 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3723 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3459 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3724 3460
3725out_unlock: 3461out_unlock:
3726 rcu_read_unlock(); 3462 rcu_read_unlock();
3727 put_online_cpus();
3728 3463
3729 return retval; 3464 return retval;
3730} 3465}
@@ -3794,16 +3529,11 @@ SYSCALL_DEFINE0(sched_yield)
3794 return 0; 3529 return 0;
3795} 3530}
3796 3531
3797static inline int should_resched(void)
3798{
3799 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3800}
3801
3802static void __cond_resched(void) 3532static void __cond_resched(void)
3803{ 3533{
3804 add_preempt_count(PREEMPT_ACTIVE); 3534 __preempt_count_add(PREEMPT_ACTIVE);
3805 __schedule(); 3535 __schedule();
3806 sub_preempt_count(PREEMPT_ACTIVE); 3536 __preempt_count_sub(PREEMPT_ACTIVE);
3807} 3537}
3808 3538
3809int __sched _cond_resched(void) 3539int __sched _cond_resched(void)
@@ -4186,7 +3916,7 @@ void init_idle(struct task_struct *idle, int cpu)
4186 3916
4187 raw_spin_lock_irqsave(&rq->lock, flags); 3917 raw_spin_lock_irqsave(&rq->lock, flags);
4188 3918
4189 __sched_fork(idle); 3919 __sched_fork(0, idle);
4190 idle->state = TASK_RUNNING; 3920 idle->state = TASK_RUNNING;
4191 idle->se.exec_start = sched_clock(); 3921 idle->se.exec_start = sched_clock();
4192 3922
@@ -4212,7 +3942,7 @@ void init_idle(struct task_struct *idle, int cpu)
4212 raw_spin_unlock_irqrestore(&rq->lock, flags); 3942 raw_spin_unlock_irqrestore(&rq->lock, flags);
4213 3943
4214 /* Set the preempt count _outside_ the spinlocks! */ 3944 /* Set the preempt count _outside_ the spinlocks! */
4215 task_thread_info(idle)->preempt_count = 0; 3945 init_idle_preempt_count(idle, cpu);
4216 3946
4217 /* 3947 /*
4218 * The idle tasks have their own, simple scheduling class: 3948 * The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4076,53 @@ fail:
4346 return ret; 4076 return ret;
4347} 4077}
4348 4078
4079#ifdef CONFIG_NUMA_BALANCING
4080/* Migrate current task p to target_cpu */
4081int migrate_task_to(struct task_struct *p, int target_cpu)
4082{
4083 struct migration_arg arg = { p, target_cpu };
4084 int curr_cpu = task_cpu(p);
4085
4086 if (curr_cpu == target_cpu)
4087 return 0;
4088
4089 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4090 return -EINVAL;
4091
4092 /* TODO: This is not properly updating schedstats */
4093
4094 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4095}
4096
4097/*
4098 * Requeue a task on a given node and accurately track the number of NUMA
4099 * tasks on the runqueues
4100 */
4101void sched_setnuma(struct task_struct *p, int nid)
4102{
4103 struct rq *rq;
4104 unsigned long flags;
4105 bool on_rq, running;
4106
4107 rq = task_rq_lock(p, &flags);
4108 on_rq = p->on_rq;
4109 running = task_current(rq, p);
4110
4111 if (on_rq)
4112 dequeue_task(rq, p, 0);
4113 if (running)
4114 p->sched_class->put_prev_task(rq, p);
4115
4116 p->numa_preferred_nid = nid;
4117
4118 if (running)
4119 p->sched_class->set_curr_task(rq);
4120 if (on_rq)
4121 enqueue_task(rq, p, 0);
4122 task_rq_unlock(rq, p, &flags);
4123}
4124#endif
4125
4349/* 4126/*
4350 * migration_cpu_stop - this will be executed by a highprio stopper thread 4127 * migration_cpu_stop - this will be executed by a highprio stopper thread
4351 * and performs thread migration by bumping thread off CPU then 4128 * and performs thread migration by bumping thread off CPU then
@@ -5119,6 +4896,9 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5119DEFINE_PER_CPU(struct sched_domain *, sd_llc); 4896DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5120DEFINE_PER_CPU(int, sd_llc_size); 4897DEFINE_PER_CPU(int, sd_llc_size);
5121DEFINE_PER_CPU(int, sd_llc_id); 4898DEFINE_PER_CPU(int, sd_llc_id);
4899DEFINE_PER_CPU(struct sched_domain *, sd_numa);
4900DEFINE_PER_CPU(struct sched_domain *, sd_busy);
4901DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5122 4902
5123static void update_top_cache_domain(int cpu) 4903static void update_top_cache_domain(int cpu)
5124{ 4904{
@@ -5130,11 +4910,18 @@ static void update_top_cache_domain(int cpu)
5130 if (sd) { 4910 if (sd) {
5131 id = cpumask_first(sched_domain_span(sd)); 4911 id = cpumask_first(sched_domain_span(sd));
5132 size = cpumask_weight(sched_domain_span(sd)); 4912 size = cpumask_weight(sched_domain_span(sd));
4913 rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
5133 } 4914 }
5134 4915
5135 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 4916 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5136 per_cpu(sd_llc_size, cpu) = size; 4917 per_cpu(sd_llc_size, cpu) = size;
5137 per_cpu(sd_llc_id, cpu) = id; 4918 per_cpu(sd_llc_id, cpu) = id;
4919
4920 sd = lowest_flag_domain(cpu, SD_NUMA);
4921 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
4922
4923 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
4924 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5138} 4925}
5139 4926
5140/* 4927/*
@@ -5654,6 +5441,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5654 | 0*SD_SHARE_PKG_RESOURCES 5441 | 0*SD_SHARE_PKG_RESOURCES
5655 | 1*SD_SERIALIZE 5442 | 1*SD_SERIALIZE
5656 | 0*SD_PREFER_SIBLING 5443 | 0*SD_PREFER_SIBLING
5444 | 1*SD_NUMA
5657 | sd_local_flags(level) 5445 | sd_local_flags(level)
5658 , 5446 ,
5659 .last_balance = jiffies, 5447 .last_balance = jiffies,
@@ -6335,14 +6123,17 @@ void __init sched_init_smp(void)
6335 6123
6336 sched_init_numa(); 6124 sched_init_numa();
6337 6125
6338 get_online_cpus(); 6126 /*
6127 * There's no userspace yet to cause hotplug operations; hence all the
6128 * cpu masks are stable and all blatant races in the below code cannot
6129 * happen.
6130 */
6339 mutex_lock(&sched_domains_mutex); 6131 mutex_lock(&sched_domains_mutex);
6340 init_sched_domains(cpu_active_mask); 6132 init_sched_domains(cpu_active_mask);
6341 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6133 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6342 if (cpumask_empty(non_isolated_cpus)) 6134 if (cpumask_empty(non_isolated_cpus))
6343 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6135 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6344 mutex_unlock(&sched_domains_mutex); 6136 mutex_unlock(&sched_domains_mutex);
6345 put_online_cpus();
6346 6137
6347 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6138 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6348 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6139 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6505,6 +6296,7 @@ void __init sched_init(void)
6505 rq->online = 0; 6296 rq->online = 0;
6506 rq->idle_stamp = 0; 6297 rq->idle_stamp = 0;
6507 rq->avg_idle = 2*sysctl_sched_migration_cost; 6298 rq->avg_idle = 2*sysctl_sched_migration_cost;
6299 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6508 6300
6509 INIT_LIST_HEAD(&rq->cfs_tasks); 6301 INIT_LIST_HEAD(&rq->cfs_tasks);
6510 6302
@@ -7277,7 +7069,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7277 7069
7278 runtime_enabled = quota != RUNTIME_INF; 7070 runtime_enabled = quota != RUNTIME_INF;
7279 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7071 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7280 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7072 /*
7073 * If we need to toggle cfs_bandwidth_used, off->on must occur
7074 * before making related changes, and on->off must occur afterwards
7075 */
7076 if (runtime_enabled && !runtime_was_enabled)
7077 cfs_bandwidth_usage_inc();
7281 raw_spin_lock_irq(&cfs_b->lock); 7078 raw_spin_lock_irq(&cfs_b->lock);
7282 cfs_b->period = ns_to_ktime(period); 7079 cfs_b->period = ns_to_ktime(period);
7283 cfs_b->quota = quota; 7080 cfs_b->quota = quota;
@@ -7303,6 +7100,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7303 unthrottle_cfs_rq(cfs_rq); 7100 unthrottle_cfs_rq(cfs_rq);
7304 raw_spin_unlock_irq(&rq->lock); 7101 raw_spin_unlock_irq(&rq->lock);
7305 } 7102 }
7103 if (runtime_was_enabled && !runtime_enabled)
7104 cfs_bandwidth_usage_dec();
7306out_unlock: 7105out_unlock:
7307 mutex_unlock(&cfs_constraints_mutex); 7106 mutex_unlock(&cfs_constraints_mutex);
7308 7107
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e076bddd4c66..5c34d1817e8f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18#include <linux/mempolicy.h>
18 19
19#include "sched.h" 20#include "sched.h"
20 21
@@ -124,7 +125,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
124 SEQ_printf(m, " "); 125 SEQ_printf(m, " ");
125 126
126 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", 127 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
127 p->comm, p->pid, 128 p->comm, task_pid_nr(p),
128 SPLIT_NS(p->se.vruntime), 129 SPLIT_NS(p->se.vruntime),
129 (long long)(p->nvcsw + p->nivcsw), 130 (long long)(p->nvcsw + p->nivcsw),
130 p->prio); 131 p->prio);
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
137 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 138 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
138 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
139#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
143#endif
140#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
141 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
142#endif 146#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 read_lock_irqsave(&tasklist_lock, flags); 163 read_lock_irqsave(&tasklist_lock, flags);
160 164
161 do_each_thread(g, p) { 165 do_each_thread(g, p) {
162 if (!p->on_rq || task_cpu(p) != rq_cpu) 166 if (task_cpu(p) != rq_cpu)
163 continue; 167 continue;
164 168
165 print_task(m, rq, p); 169 print_task(m, rq, p);
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
225 atomic_read(&cfs_rq->tg->runnable_avg)); 229 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 230#endif
227#endif 231#endif
232#ifdef CONFIG_CFS_BANDWIDTH
233 SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
234 cfs_rq->tg->cfs_bandwidth.timer_active);
235 SEQ_printf(m, " .%-30s: %d\n", "throttled",
236 cfs_rq->throttled);
237 SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
238 cfs_rq->throttle_count);
239#endif
228 240
229#ifdef CONFIG_FAIR_GROUP_SCHED 241#ifdef CONFIG_FAIR_GROUP_SCHED
230 print_cfs_group_stats(m, cpu, cfs_rq->tg); 242 print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -289,7 +301,7 @@ do { \
289 P(nr_load_updates); 301 P(nr_load_updates);
290 P(nr_uninterruptible); 302 P(nr_uninterruptible);
291 PN(next_balance); 303 PN(next_balance);
292 P(curr->pid); 304 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
293 PN(clock); 305 PN(clock);
294 P(cpu_load[0]); 306 P(cpu_load[0]);
295 P(cpu_load[1]); 307 P(cpu_load[1]);
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
345 cpu_clk = local_clock(); 357 cpu_clk = local_clock();
346 local_irq_restore(flags); 358 local_irq_restore(flags);
347 359
348 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", 360 SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
349 init_utsname()->release, 361 init_utsname()->release,
350 (int)strcspn(init_utsname()->version, " "), 362 (int)strcspn(init_utsname()->version, " "),
351 init_utsname()->version); 363 init_utsname()->version);
@@ -488,11 +500,61 @@ static int __init init_sched_debug_procfs(void)
488 500
489__initcall(init_sched_debug_procfs); 501__initcall(init_sched_debug_procfs);
490 502
503#define __P(F) \
504 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
505#define P(F) \
506 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
507#define __PN(F) \
508 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
509#define PN(F) \
510 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
511
512
513static void sched_show_numa(struct task_struct *p, struct seq_file *m)
514{
515#ifdef CONFIG_NUMA_BALANCING
516 struct mempolicy *pol;
517 int node, i;
518
519 if (p->mm)
520 P(mm->numa_scan_seq);
521
522 task_lock(p);
523 pol = p->mempolicy;
524 if (pol && !(pol->flags & MPOL_F_MORON))
525 pol = NULL;
526 mpol_get(pol);
527 task_unlock(p);
528
529 SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
530
531 for_each_online_node(node) {
532 for (i = 0; i < 2; i++) {
533 unsigned long nr_faults = -1;
534 int cpu_current, home_node;
535
536 if (p->numa_faults)
537 nr_faults = p->numa_faults[2*node + i];
538
539 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes));
541
542 home_node = (p->numa_preferred_nid == node);
543
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults);
546 }
547 }
548
549 mpol_put(pol);
550#endif
551}
552
491void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 553void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
492{ 554{
493 unsigned long nr_switches; 555 unsigned long nr_switches;
494 556
495 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, 557 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
496 get_nr_threads(p)); 558 get_nr_threads(p));
497 SEQ_printf(m, 559 SEQ_printf(m,
498 "---------------------------------------------------------" 560 "---------------------------------------------------------"
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
591 SEQ_printf(m, "%-45s:%21Ld\n", 653 SEQ_printf(m, "%-45s:%21Ld\n",
592 "clock-delta", (long long)(t1-t0)); 654 "clock-delta", (long long)(t1-t0));
593 } 655 }
656
657 sched_show_numa(p, m);
594} 658}
595 659
596void proc_sched_set_task(struct task_struct *p) 660void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7f0a5e6cdae0..e8b652ebe027 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
681} 681}
682 682
683#ifdef CONFIG_SMP 683#ifdef CONFIG_SMP
684static unsigned long task_h_load(struct task_struct *p);
685
684static inline void __update_task_entity_contrib(struct sched_entity *se); 686static inline void __update_task_entity_contrib(struct sched_entity *se);
685 687
686/* Give new task start runnable values to heavy its load in infant time */ 688/* Give new task start runnable values to heavy its load in infant time */
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
818 820
819#ifdef CONFIG_NUMA_BALANCING 821#ifdef CONFIG_NUMA_BALANCING
820/* 822/*
821 * numa task sample period in ms 823 * Approximate time to scan a full NUMA task in ms. The task scan period is
 824 * calculated based on the task's virtual memory size and
825 * numa_balancing_scan_size.
822 */ 826 */
823unsigned int sysctl_numa_balancing_scan_period_min = 100; 827unsigned int sysctl_numa_balancing_scan_period_min = 1000;
824unsigned int sysctl_numa_balancing_scan_period_max = 100*50; 828unsigned int sysctl_numa_balancing_scan_period_max = 60000;
825unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 829
827/* Portion of address space to scan in MB */ 830/* Portion of address space to scan in MB */
828unsigned int sysctl_numa_balancing_scan_size = 256; 831unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +833,835 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
830/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 833/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831unsigned int sysctl_numa_balancing_scan_delay = 1000; 834unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 835
833static void task_numa_placement(struct task_struct *p) 836/*
837 * After skipping a page migration on a shared page, skip N more numa page
838 * migrations unconditionally. This reduces the number of NUMA migrations
839 * in shared memory workloads, and has the effect of pulling tasks towards
840 * where their memory lives, over pulling the memory towards the task.
841 */
842unsigned int sysctl_numa_balancing_migrate_deferred = 16;
843
844static unsigned int task_nr_scan_windows(struct task_struct *p)
845{
846 unsigned long rss = 0;
847 unsigned long nr_scan_pages;
848
849 /*
850 * Calculations based on RSS as non-present and empty pages are skipped
851 * by the PTE scanner and NUMA hinting faults should be trapped based
852 * on resident pages
853 */
854 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
855 rss = get_mm_rss(p->mm);
856 if (!rss)
857 rss = nr_scan_pages;
858
859 rss = round_up(rss, nr_scan_pages);
860 return rss / nr_scan_pages;
861}
862
 863/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
864#define MAX_SCAN_WINDOW 2560
865
866static unsigned int task_scan_min(struct task_struct *p)
867{
868 unsigned int scan, floor;
869 unsigned int windows = 1;
870
871 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
872 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
873 floor = 1000 / windows;
874
875 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
876 return max_t(unsigned int, floor, scan);
877}
878
879static unsigned int task_scan_max(struct task_struct *p)
880{
881 unsigned int smin = task_scan_min(p);
882 unsigned int smax;
883
884 /* Watch for min being lower than max due to floor calculations */
885 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
886 return max(smin, smax);
887}
888
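
To make the RSS-based window arithmetic above concrete, here is a minimal user-space sketch that mirrors task_nr_scan_windows(), task_scan_min() and task_scan_max(); the 2GB RSS, 4K page size and the sysctl defaults used below are illustrative assumptions, not values read from a running kernel.

#include <stdio.h>

#define PAGE_SHIFT      12          /* assumed 4K pages */
#define MAX_SCAN_WINDOW 2560        /* MB/sec, as defined in the hunk above */

static unsigned int scan_size_mb  = 256;    /* sysctl_numa_balancing_scan_size */
static unsigned int period_min_ms = 1000;   /* sysctl_numa_balancing_scan_period_min */
static unsigned int period_max_ms = 60000;  /* sysctl_numa_balancing_scan_period_max */

/* number of scan_size-sized windows needed to cover the task's RSS */
static unsigned int nr_scan_windows(unsigned long rss_pages)
{
        unsigned long nr_scan_pages = (unsigned long)scan_size_mb << (20 - PAGE_SHIFT);

        if (!rss_pages)
                rss_pages = nr_scan_pages;
        rss_pages = (rss_pages + nr_scan_pages - 1) / nr_scan_pages * nr_scan_pages;
        return rss_pages / nr_scan_pages;
}

int main(void)
{
        unsigned long rss_pages = 2UL << (30 - PAGE_SHIFT);  /* assume a 2GB RSS */
        unsigned int windows = nr_scan_windows(rss_pages);
        unsigned int floor_windows = 1, floor_ms, smin, smax;

        if (scan_size_mb < MAX_SCAN_WINDOW)
                floor_windows = MAX_SCAN_WINDOW / scan_size_mb;
        floor_ms = 1000 / floor_windows;

        smin = period_min_ms / windows;
        smin = smin > floor_ms ? smin : floor_ms;
        smax = period_max_ms / windows;
        smax = smax > smin ? smax : smin;

        /* prints: windows=8 scan_period_min=125ms scan_period_max=7500ms */
        printf("windows=%u scan_period_min=%ums scan_period_max=%ums\n",
               windows, smin, smax);
        return 0;
}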
889/*
890 * Once a preferred node is selected the scheduler balancer will prefer moving
891 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
892 * scans. This will give the process the chance to accumulate more faults on
893 * the preferred node but still allow the scheduler to move the task again if
 894 * the node's CPUs are overloaded.
895 */
896unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
897
898static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
899{
900 rq->nr_numa_running += (p->numa_preferred_nid != -1);
901 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
902}
903
904static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
905{
906 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
907 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
908}
909
910struct numa_group {
911 atomic_t refcount;
912
913 spinlock_t lock; /* nr_tasks, tasks */
914 int nr_tasks;
915 pid_t gid;
916 struct list_head task_list;
917
918 struct rcu_head rcu;
919 unsigned long total_faults;
920 unsigned long faults[0];
921};
922
923pid_t task_numa_group_id(struct task_struct *p)
924{
925 return p->numa_group ? p->numa_group->gid : 0;
926}
927
928static inline int task_faults_idx(int nid, int priv)
929{
930 return 2 * nid + priv;
931}
932
933static inline unsigned long task_faults(struct task_struct *p, int nid)
834{ 934{
835 int seq; 935 if (!p->numa_faults)
936 return 0;
937
938 return p->numa_faults[task_faults_idx(nid, 0)] +
939 p->numa_faults[task_faults_idx(nid, 1)];
940}
941
942static inline unsigned long group_faults(struct task_struct *p, int nid)
943{
944 if (!p->numa_group)
945 return 0;
946
947 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
948}
949
950/*
951 * These return the fraction of accesses done by a particular task, or
952 * task group, on a particular numa node. The group weight is given a
953 * larger multiplier, in order to group tasks together that are almost
954 * evenly spread out between numa nodes.
955 */
956static inline unsigned long task_weight(struct task_struct *p, int nid)
957{
958 unsigned long total_faults;
959
960 if (!p->numa_faults)
961 return 0;
962
963 total_faults = p->total_numa_faults;
964
965 if (!total_faults)
966 return 0;
967
968 return 1000 * task_faults(p, nid) / total_faults;
969}
970
971static inline unsigned long group_weight(struct task_struct *p, int nid)
972{
973 if (!p->numa_group || !p->numa_group->total_faults)
974 return 0;
975
976 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
977}
836 978
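
The per-mille weights above are easiest to read with concrete numbers. A minimal sketch with invented fault counts for a two-node box; the array layout follows task_faults_idx(), two slots per node.

#include <stdio.h>

#define NR_NODES 2

/* faults[2*nid + priv], as in task_faults_idx() above; counts are invented */
static unsigned long task_faults_arr[2 * NR_NODES]  = { 300, 100, 50, 50 };
static unsigned long group_faults_arr[2 * NR_NODES] = { 900, 300, 600, 200 };

static unsigned long sum(const unsigned long *f)
{
        unsigned long total = 0;
        int i;

        for (i = 0; i < 2 * NR_NODES; i++)
                total += f[i];
        return total;
}

int main(void)
{
        int nid;

        for (nid = 0; nid < NR_NODES; nid++) {
                unsigned long tw = 1000 * (task_faults_arr[2*nid] + task_faults_arr[2*nid+1])
                                        / sum(task_faults_arr);
                unsigned long gw = 1000 * (group_faults_arr[2*nid] + group_faults_arr[2*nid+1])
                                        / sum(group_faults_arr);

                /* node 0: task_weight=800 group_weight=600; node 1: 200 / 400 */
                printf("node %d: task_weight=%lu group_weight=%lu\n", nid, tw, gw);
        }
        return 0;
}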
837 if (!p->mm) /* for example, ksmd faulting in a user's mm */ 979static unsigned long weighted_cpuload(const int cpu);
980static unsigned long source_load(int cpu, int type);
981static unsigned long target_load(int cpu, int type);
982static unsigned long power_of(int cpu);
983static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
984
985/* Cached statistics for all CPUs within a node */
986struct numa_stats {
987 unsigned long nr_running;
988 unsigned long load;
989
990 /* Total compute capacity of CPUs on a node */
991 unsigned long power;
992
993 /* Approximate capacity in terms of runnable tasks on a node */
994 unsigned long capacity;
995 int has_capacity;
996};
997
998/*
999 * XXX borrowed from update_sg_lb_stats
1000 */
1001static void update_numa_stats(struct numa_stats *ns, int nid)
1002{
1003 int cpu, cpus = 0;
1004
1005 memset(ns, 0, sizeof(*ns));
1006 for_each_cpu(cpu, cpumask_of_node(nid)) {
1007 struct rq *rq = cpu_rq(cpu);
1008
1009 ns->nr_running += rq->nr_running;
1010 ns->load += weighted_cpuload(cpu);
1011 ns->power += power_of(cpu);
1012
1013 cpus++;
1014 }
1015
1016 /*
1017 * If we raced with hotplug and there are no CPUs left in our mask
1018 * the @ns structure is NULL'ed and task_numa_compare() will
1019 * not find this node attractive.
1020 *
1021 * We'll either bail at !has_capacity, or we'll detect a huge imbalance
1022 * and bail there.
1023 */
1024 if (!cpus)
838 return; 1025 return;
1026
1027 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1028 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1029 ns->has_capacity = (ns->nr_running < ns->capacity);
1030}
1031
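
As a rough illustration of the normalisation done in update_numa_stats(), assume a node with four CPUs at nominal power and a summed weighted load of 3072; both values are invented for this sketch.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024

int main(void)
{
        /* assumed per-node totals for a 4-cpu node */
        unsigned long nr_running = 3;
        unsigned long load  = 3072;              /* sum of weighted_cpuload() */
        unsigned long power = 4 * SCHED_POWER_SCALE;

        unsigned long norm_load = load * SCHED_POWER_SCALE / power;
        unsigned long capacity  = (power + SCHED_POWER_SCALE / 2) / SCHED_POWER_SCALE;
        int has_capacity = nr_running < capacity;

        /* prints: load=768 capacity=4 has_capacity=1 */
        printf("load=%lu capacity=%lu has_capacity=%d\n", norm_load, capacity, has_capacity);
        return 0;
}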
1032struct task_numa_env {
1033 struct task_struct *p;
1034
1035 int src_cpu, src_nid;
1036 int dst_cpu, dst_nid;
1037
1038 struct numa_stats src_stats, dst_stats;
1039
1040 int imbalance_pct, idx;
1041
1042 struct task_struct *best_task;
1043 long best_imp;
1044 int best_cpu;
1045};
1046
1047static void task_numa_assign(struct task_numa_env *env,
1048 struct task_struct *p, long imp)
1049{
1050 if (env->best_task)
1051 put_task_struct(env->best_task);
1052 if (p)
1053 get_task_struct(p);
1054
1055 env->best_task = p;
1056 env->best_imp = imp;
1057 env->best_cpu = env->dst_cpu;
1058}
1059
1060/*
1061 * This checks if the overall compute and NUMA accesses of the system would
 1062 * be improved if the source task were migrated to the target dst_cpu, taking
 1063 * into account that it might be best if the task running on the dst_cpu is
 1064 * exchanged with the source task.
1065 */
1066static void task_numa_compare(struct task_numa_env *env,
1067 long taskimp, long groupimp)
1068{
1069 struct rq *src_rq = cpu_rq(env->src_cpu);
1070 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1071 struct task_struct *cur;
1072 long dst_load, src_load;
1073 long load;
1074 long imp = (groupimp > 0) ? groupimp : taskimp;
1075
1076 rcu_read_lock();
1077 cur = ACCESS_ONCE(dst_rq->curr);
1078 if (cur->pid == 0) /* idle */
1079 cur = NULL;
1080
1081 /*
1082 * "imp" is the fault differential for the source task between the
1083 * source and destination node. Calculate the total differential for
1084 * the source task and potential destination task. The more negative
 1085 * the value is, the more remote accesses would be expected to
1086 * be incurred if the tasks were swapped.
1087 */
1088 if (cur) {
1089 /* Skip this swap candidate if cannot move to the source cpu */
1090 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1091 goto unlock;
1092
1093 /*
1094 * If dst and source tasks are in the same NUMA group, or not
1095 * in any group then look only at task weights.
1096 */
1097 if (cur->numa_group == env->p->numa_group) {
1098 imp = taskimp + task_weight(cur, env->src_nid) -
1099 task_weight(cur, env->dst_nid);
1100 /*
1101 * Add some hysteresis to prevent swapping the
1102 * tasks within a group over tiny differences.
1103 */
1104 if (cur->numa_group)
1105 imp -= imp/16;
1106 } else {
1107 /*
1108 * Compare the group weights. If a task is all by
1109 * itself (not part of a group), use the task weight
1110 * instead.
1111 */
1112 if (env->p->numa_group)
1113 imp = groupimp;
1114 else
1115 imp = taskimp;
1116
1117 if (cur->numa_group)
1118 imp += group_weight(cur, env->src_nid) -
1119 group_weight(cur, env->dst_nid);
1120 else
1121 imp += task_weight(cur, env->src_nid) -
1122 task_weight(cur, env->dst_nid);
1123 }
1124 }
1125
1126 if (imp < env->best_imp)
1127 goto unlock;
1128
1129 if (!cur) {
1130 /* Is there capacity at our destination? */
1131 if (env->src_stats.has_capacity &&
1132 !env->dst_stats.has_capacity)
1133 goto unlock;
1134
1135 goto balance;
1136 }
1137
1138 /* Balance doesn't matter much if we're running a task per cpu */
1139 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1140 goto assign;
1141
1142 /*
1143 * In the overloaded case, try and keep the load balanced.
1144 */
1145balance:
1146 dst_load = env->dst_stats.load;
1147 src_load = env->src_stats.load;
1148
1149 /* XXX missing power terms */
1150 load = task_h_load(env->p);
1151 dst_load += load;
1152 src_load -= load;
1153
1154 if (cur) {
1155 load = task_h_load(cur);
1156 dst_load -= load;
1157 src_load += load;
1158 }
1159
1160 /* make src_load the smaller */
1161 if (dst_load < src_load)
1162 swap(dst_load, src_load);
1163
1164 if (src_load * env->imbalance_pct < dst_load * 100)
1165 goto unlock;
1166
1167assign:
1168 task_numa_assign(env, cur, imp);
1169unlock:
1170 rcu_read_unlock();
1171}
1172
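
The tail of task_numa_compare() boils down to: after the (possibly swapped) move, the smaller load times imbalance_pct must still cover the larger load times 100. A small sketch with assumed loads and the default imbalance_pct of 112 used in the env initialisation below.

#include <stdio.h>
#include <stdbool.h>

/* returns true if moving 'load' from src to dst stays within imbalance_pct */
static bool within_imbalance(long src_load, long dst_load, long load, int imbalance_pct)
{
        long tmp;

        dst_load += load;
        src_load -= load;

        /* make src_load the smaller, as in the hunk above */
        if (dst_load < src_load) {
                tmp = dst_load;
                dst_load = src_load;
                src_load = tmp;
        }

        return src_load * imbalance_pct >= dst_load * 100;
}

int main(void)
{
        /* invented node loads and task load */
        printf("%d\n", within_imbalance(2048, 1024, 512, 112));  /* 1: move acceptable */
        printf("%d\n", within_imbalance(1024, 2048, 512, 112));  /* 0: would overload dst */
        return 0;
}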
1173static void task_numa_find_cpu(struct task_numa_env *env,
1174 long taskimp, long groupimp)
1175{
1176 int cpu;
1177
1178 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1179 /* Skip this CPU if the source task cannot migrate */
1180 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1181 continue;
1182
1183 env->dst_cpu = cpu;
1184 task_numa_compare(env, taskimp, groupimp);
1185 }
1186}
1187
1188static int task_numa_migrate(struct task_struct *p)
1189{
1190 struct task_numa_env env = {
1191 .p = p,
1192
1193 .src_cpu = task_cpu(p),
1194 .src_nid = task_node(p),
1195
1196 .imbalance_pct = 112,
1197
1198 .best_task = NULL,
1199 .best_imp = 0,
1200 .best_cpu = -1
1201 };
1202 struct sched_domain *sd;
1203 unsigned long taskweight, groupweight;
1204 int nid, ret;
1205 long taskimp, groupimp;
1206
1207 /*
1208 * Pick the lowest SD_NUMA domain, as that would have the smallest
1209 * imbalance and would be the first to start moving tasks about.
1210 *
1211 * And we want to avoid any moving of tasks about, as that would create
1212 * random movement of tasks -- counter the numa conditions we're trying
1213 * to satisfy here.
1214 */
1215 rcu_read_lock();
1216 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1217 if (sd)
1218 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1219 rcu_read_unlock();
1220
1221 /*
1222 * Cpusets can break the scheduler domain tree into smaller
1223 * balance domains, some of which do not cross NUMA boundaries.
1224 * Tasks that are "trapped" in such domains cannot be migrated
1225 * elsewhere, so there is no point in (re)trying.
1226 */
1227 if (unlikely(!sd)) {
1228 p->numa_preferred_nid = cpu_to_node(task_cpu(p));
1229 return -EINVAL;
1230 }
1231
1232 taskweight = task_weight(p, env.src_nid);
1233 groupweight = group_weight(p, env.src_nid);
1234 update_numa_stats(&env.src_stats, env.src_nid);
1235 env.dst_nid = p->numa_preferred_nid;
1236 taskimp = task_weight(p, env.dst_nid) - taskweight;
1237 groupimp = group_weight(p, env.dst_nid) - groupweight;
1238 update_numa_stats(&env.dst_stats, env.dst_nid);
1239
1240 /* If the preferred nid has capacity, try to use it. */
1241 if (env.dst_stats.has_capacity)
1242 task_numa_find_cpu(&env, taskimp, groupimp);
1243
1244 /* No space available on the preferred nid. Look elsewhere. */
1245 if (env.best_cpu == -1) {
1246 for_each_online_node(nid) {
1247 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1248 continue;
1249
1250 /* Only consider nodes where both task and groups benefit */
1251 taskimp = task_weight(p, nid) - taskweight;
1252 groupimp = group_weight(p, nid) - groupweight;
1253 if (taskimp < 0 && groupimp < 0)
1254 continue;
1255
1256 env.dst_nid = nid;
1257 update_numa_stats(&env.dst_stats, env.dst_nid);
1258 task_numa_find_cpu(&env, taskimp, groupimp);
1259 }
1260 }
1261
1262 /* No better CPU than the current one was found. */
1263 if (env.best_cpu == -1)
1264 return -EAGAIN;
1265
1266 sched_setnuma(p, env.dst_nid);
1267
1268 /*
1269 * Reset the scan period if the task is being rescheduled on an
 1270 * alternative node to recheck if the task is now properly placed.
1271 */
1272 p->numa_scan_period = task_scan_min(p);
1273
1274 if (env.best_task == NULL) {
1275 int ret = migrate_task_to(p, env.best_cpu);
1276 return ret;
1277 }
1278
1279 ret = migrate_swap(p, env.best_task);
1280 put_task_struct(env.best_task);
1281 return ret;
1282}
1283
1284/* Attempt to migrate a task to a CPU on the preferred node. */
1285static void numa_migrate_preferred(struct task_struct *p)
1286{
1287 /* This task has no NUMA fault statistics yet */
1288 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1289 return;
1290
1291 /* Periodically retry migrating the task to the preferred node */
1292 p->numa_migrate_retry = jiffies + HZ;
1293
1294 /* Success if task is already running on preferred CPU */
1295 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
1296 return;
1297
1298 /* Otherwise, try migrate to a CPU on the preferred node */
1299 task_numa_migrate(p);
1300}
1301
1302/*
1303 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1304 * increments. The more local the fault statistics are, the higher the scan
1305 * period will be for the next scan window. If local/remote ratio is below
 1306 * NUMA_PERIOD_THRESHOLD (where the range of the ratio is 1..NUMA_PERIOD_SLOTS) the
 1307 * scan period will decrease.
1308 */
1309#define NUMA_PERIOD_SLOTS 10
1310#define NUMA_PERIOD_THRESHOLD 3
1311
1312/*
1313 * Increase the scan period (slow down scanning) if the majority of
1314 * our memory is already on our local node, or if the majority of
1315 * the page accesses are shared with other processes.
1316 * Otherwise, decrease the scan period.
1317 */
1318static void update_task_scan_period(struct task_struct *p,
1319 unsigned long shared, unsigned long private)
1320{
1321 unsigned int period_slot;
1322 int ratio;
1323 int diff;
1324
1325 unsigned long remote = p->numa_faults_locality[0];
1326 unsigned long local = p->numa_faults_locality[1];
1327
1328 /*
 1329 * If there were no recorded hinting faults then either the task is
 1330 * completely idle or all activity is in areas that are not of interest
 1331 * to automatic numa balancing. Scan slower.
1332 */
1333 if (local + shared == 0) {
1334 p->numa_scan_period = min(p->numa_scan_period_max,
1335 p->numa_scan_period << 1);
1336
1337 p->mm->numa_next_scan = jiffies +
1338 msecs_to_jiffies(p->numa_scan_period);
1339
1340 return;
1341 }
1342
1343 /*
1344 * Prepare to scale scan period relative to the current period.
1345 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1346 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1347 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1348 */
1349 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1350 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1351 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1352 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1353 if (!slot)
1354 slot = 1;
1355 diff = slot * period_slot;
1356 } else {
1357 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1358
1359 /*
1360 * Scale scan rate increases based on sharing. There is an
1361 * inverse relationship between the degree of sharing and
1362 * the adjustment made to the scanning period. Broadly
1363 * speaking the intent is that there is little point
1364 * scanning faster if shared accesses dominate as it may
1365 * simply bounce migrations uselessly
1366 */
1367 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1368 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1369 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1370 }
1371
1372 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1373 task_scan_min(p), task_scan_max(p));
1374 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1375}
1376
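
A back-of-the-envelope run of the slot arithmetic in update_task_scan_period(), using an assumed 1000ms current period and an invented fault mix; note the kernel additionally clamps the result to [task_scan_min(), task_scan_max()], which this sketch omits.

#include <stdio.h>

#define NUMA_PERIOD_SLOTS     10
#define NUMA_PERIOD_THRESHOLD 3

int main(void)
{
        unsigned int period = 1000;                 /* assumed current scan period, ms */
        unsigned int period_slot = (period + NUMA_PERIOD_SLOTS - 1) / NUMA_PERIOD_SLOTS;
        unsigned long local = 80, remote = 20;      /* assumed fault locality */
        unsigned long private = 70, shared = 30;    /* assumed private/shared split */
        int ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
        int diff;

        if (ratio >= NUMA_PERIOD_THRESHOLD) {
                /* mostly local: slow scanning down */
                int slot = ratio - NUMA_PERIOD_THRESHOLD;
                if (!slot)
                        slot = 1;
                diff = slot * period_slot;
        } else {
                /* mostly remote: speed scanning up, scaled by how private the faults are */
                diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
                ratio = (private * NUMA_PERIOD_SLOTS + private + shared - 1) / (private + shared);
                diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
        }

        /* prints: ratio=8 diff=+500ms new_period=1500ms */
        printf("ratio=%d diff=%+dms new_period=%dms\n", ratio, diff, (int)period + diff);
        return 0;
}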
1377static void task_numa_placement(struct task_struct *p)
1378{
1379 int seq, nid, max_nid = -1, max_group_nid = -1;
1380 unsigned long max_faults = 0, max_group_faults = 0;
1381 unsigned long fault_types[2] = { 0, 0 };
1382 spinlock_t *group_lock = NULL;
1383
839 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1384 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 if (p->numa_scan_seq == seq) 1385 if (p->numa_scan_seq == seq)
841 return; 1386 return;
842 p->numa_scan_seq = seq; 1387 p->numa_scan_seq = seq;
1388 p->numa_scan_period_max = task_scan_max(p);
1389
1390 /* If the task is part of a group prevent parallel updates to group stats */
1391 if (p->numa_group) {
1392 group_lock = &p->numa_group->lock;
1393 spin_lock(group_lock);
1394 }
1395
1396 /* Find the node with the highest number of faults */
1397 for_each_online_node(nid) {
1398 unsigned long faults = 0, group_faults = 0;
1399 int priv, i;
1400
1401 for (priv = 0; priv < 2; priv++) {
1402 long diff;
1403
1404 i = task_faults_idx(nid, priv);
1405 diff = -p->numa_faults[i];
1406
1407 /* Decay existing window, copy faults since last scan */
1408 p->numa_faults[i] >>= 1;
1409 p->numa_faults[i] += p->numa_faults_buffer[i];
1410 fault_types[priv] += p->numa_faults_buffer[i];
1411 p->numa_faults_buffer[i] = 0;
1412
1413 faults += p->numa_faults[i];
1414 diff += p->numa_faults[i];
1415 p->total_numa_faults += diff;
1416 if (p->numa_group) {
1417 /* safe because we can only change our own group */
1418 p->numa_group->faults[i] += diff;
1419 p->numa_group->total_faults += diff;
1420 group_faults += p->numa_group->faults[i];
1421 }
1422 }
1423
1424 if (faults > max_faults) {
1425 max_faults = faults;
1426 max_nid = nid;
1427 }
1428
1429 if (group_faults > max_group_faults) {
1430 max_group_faults = group_faults;
1431 max_group_nid = nid;
1432 }
1433 }
1434
1435 update_task_scan_period(p, fault_types[0], fault_types[1]);
1436
1437 if (p->numa_group) {
1438 /*
1439 * If the preferred task and group nids are different,
1440 * iterate over the nodes again to find the best place.
1441 */
1442 if (max_nid != max_group_nid) {
1443 unsigned long weight, max_weight = 0;
1444
1445 for_each_online_node(nid) {
1446 weight = task_weight(p, nid) + group_weight(p, nid);
1447 if (weight > max_weight) {
1448 max_weight = weight;
1449 max_nid = nid;
1450 }
1451 }
1452 }
1453
1454 spin_unlock(group_lock);
1455 }
1456
1457 /* Preferred node as the node with the most faults */
1458 if (max_faults && max_nid != p->numa_preferred_nid) {
1459 /* Update the preferred nid and migrate task if possible */
1460 sched_setnuma(p, max_nid);
1461 numa_migrate_preferred(p);
1462 }
1463}
1464
1465static inline int get_numa_group(struct numa_group *grp)
1466{
1467 return atomic_inc_not_zero(&grp->refcount);
1468}
1469
1470static inline void put_numa_group(struct numa_group *grp)
1471{
1472 if (atomic_dec_and_test(&grp->refcount))
1473 kfree_rcu(grp, rcu);
1474}
1475
1476static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1477 int *priv)
1478{
1479 struct numa_group *grp, *my_grp;
1480 struct task_struct *tsk;
1481 bool join = false;
1482 int cpu = cpupid_to_cpu(cpupid);
1483 int i;
1484
1485 if (unlikely(!p->numa_group)) {
1486 unsigned int size = sizeof(struct numa_group) +
1487 2*nr_node_ids*sizeof(unsigned long);
1488
1489 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1490 if (!grp)
1491 return;
1492
1493 atomic_set(&grp->refcount, 1);
1494 spin_lock_init(&grp->lock);
1495 INIT_LIST_HEAD(&grp->task_list);
1496 grp->gid = p->pid;
1497
1498 for (i = 0; i < 2*nr_node_ids; i++)
1499 grp->faults[i] = p->numa_faults[i];
1500
1501 grp->total_faults = p->total_numa_faults;
1502
1503 list_add(&p->numa_entry, &grp->task_list);
1504 grp->nr_tasks++;
1505 rcu_assign_pointer(p->numa_group, grp);
1506 }
1507
1508 rcu_read_lock();
1509 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1510
1511 if (!cpupid_match_pid(tsk, cpupid))
1512 goto no_join;
1513
1514 grp = rcu_dereference(tsk->numa_group);
1515 if (!grp)
1516 goto no_join;
1517
1518 my_grp = p->numa_group;
1519 if (grp == my_grp)
1520 goto no_join;
1521
1522 /*
 1523 * Only join the other group if it's bigger; if we're the bigger group,
1524 * the other task will join us.
1525 */
1526 if (my_grp->nr_tasks > grp->nr_tasks)
1527 goto no_join;
1528
1529 /*
1530 * Tie-break on the grp address.
1531 */
1532 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1533 goto no_join;
1534
1535 /* Always join threads in the same process. */
1536 if (tsk->mm == current->mm)
1537 join = true;
843 1538
844 /* FIXME: Scheduling placement policy hints go here */ 1539 /* Simple filter to avoid false positives due to PID collisions */
1540 if (flags & TNF_SHARED)
1541 join = true;
1542
1543 /* Update priv based on whether false sharing was detected */
1544 *priv = !join;
1545
1546 if (join && !get_numa_group(grp))
1547 goto no_join;
1548
1549 rcu_read_unlock();
1550
1551 if (!join)
1552 return;
1553
1554 double_lock(&my_grp->lock, &grp->lock);
1555
1556 for (i = 0; i < 2*nr_node_ids; i++) {
1557 my_grp->faults[i] -= p->numa_faults[i];
1558 grp->faults[i] += p->numa_faults[i];
1559 }
1560 my_grp->total_faults -= p->total_numa_faults;
1561 grp->total_faults += p->total_numa_faults;
1562
1563 list_move(&p->numa_entry, &grp->task_list);
1564 my_grp->nr_tasks--;
1565 grp->nr_tasks++;
1566
1567 spin_unlock(&my_grp->lock);
1568 spin_unlock(&grp->lock);
1569
1570 rcu_assign_pointer(p->numa_group, grp);
1571
1572 put_numa_group(my_grp);
1573 return;
1574
1575no_join:
1576 rcu_read_unlock();
1577 return;
1578}
1579
1580void task_numa_free(struct task_struct *p)
1581{
1582 struct numa_group *grp = p->numa_group;
1583 int i;
1584 void *numa_faults = p->numa_faults;
1585
1586 if (grp) {
1587 spin_lock(&grp->lock);
1588 for (i = 0; i < 2*nr_node_ids; i++)
1589 grp->faults[i] -= p->numa_faults[i];
1590 grp->total_faults -= p->total_numa_faults;
1591
1592 list_del(&p->numa_entry);
1593 grp->nr_tasks--;
1594 spin_unlock(&grp->lock);
1595 rcu_assign_pointer(p->numa_group, NULL);
1596 put_numa_group(grp);
1597 }
1598
1599 p->numa_faults = NULL;
1600 p->numa_faults_buffer = NULL;
1601 kfree(numa_faults);
845} 1602}
846 1603
847/* 1604/*
848 * Got a PROT_NONE fault for a page on @node. 1605 * Got a PROT_NONE fault for a page on @node.
849 */ 1606 */
850void task_numa_fault(int node, int pages, bool migrated) 1607void task_numa_fault(int last_cpupid, int node, int pages, int flags)
851{ 1608{
852 struct task_struct *p = current; 1609 struct task_struct *p = current;
1610 bool migrated = flags & TNF_MIGRATED;
1611 int priv;
853 1612
854 if (!numabalancing_enabled) 1613 if (!numabalancing_enabled)
855 return; 1614 return;
856 1615
857 /* FIXME: Allocate task-specific structure for placement policy here */ 1616 /* for example, ksmd faulting in a user's mm */
1617 if (!p->mm)
1618 return;
1619
1620 /* Do not worry about placement if exiting */
1621 if (p->state == TASK_DEAD)
1622 return;
1623
1624 /* Allocate buffer to track faults on a per-node basis */
1625 if (unlikely(!p->numa_faults)) {
1626 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1627
1628 /* numa_faults and numa_faults_buffer share the allocation */
1629 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1630 if (!p->numa_faults)
1631 return;
1632
1633 BUG_ON(p->numa_faults_buffer);
1634 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1635 p->total_numa_faults = 0;
1636 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1637 }
858 1638
859 /* 1639 /*
860 * If pages are properly placed (did not migrate) then scan slower. 1640 * First accesses are treated as private, otherwise consider accesses
861 * This is reset periodically in case of phase changes 1641 * to be private if the accessing pid has not changed
862 */ 1642 */
863 if (!migrated) 1643 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
864 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, 1644 priv = 1;
865 p->numa_scan_period + jiffies_to_msecs(10)); 1645 } else {
1646 priv = cpupid_match_pid(p, last_cpupid);
1647 if (!priv && !(flags & TNF_NO_GROUP))
1648 task_numa_group(p, last_cpupid, flags, &priv);
1649 }
866 1650
867 task_numa_placement(p); 1651 task_numa_placement(p);
1652
1653 /*
 1654 * Retry task-to-preferred-node migration periodically, in case it
 1655 * previously failed, or the scheduler moved us.
1656 */
1657 if (time_after(jiffies, p->numa_migrate_retry))
1658 numa_migrate_preferred(p);
1659
1660 if (migrated)
1661 p->numa_pages_migrated += pages;
1662
1663 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1664 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
868} 1665}
869 1666
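
For reference, the shared allocation made in task_numa_fault() sizes out as follows on an assumed 4-node, 64-bit machine; this is only a sketch of the sizing, not kernel code.

#include <stdio.h>

int main(void)
{
        int nr_node_ids = 4;                         /* assumed node count */
        size_t entry = sizeof(unsigned long);        /* type of *numa_faults */
        size_t size = entry * 2 * nr_node_ids;       /* [shared|private] slot per node */

        /* numa_faults and numa_faults_buffer share one allocation */
        printf("numa_faults:      %zu bytes (%d counters)\n", size, 2 * nr_node_ids);
        printf("total allocation: %zu bytes (faults + buffer)\n", size * 2);
        printf("buffer offset:    %d entries\n", 2 * nr_node_ids);
        return 0;
}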
870static void reset_ptenuma_scan(struct task_struct *p) 1667static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1681,7 @@ void task_numa_work(struct callback_head *work)
884 struct mm_struct *mm = p->mm; 1681 struct mm_struct *mm = p->mm;
885 struct vm_area_struct *vma; 1682 struct vm_area_struct *vma;
886 unsigned long start, end; 1683 unsigned long start, end;
1684 unsigned long nr_pte_updates = 0;
887 long pages; 1685 long pages;
888 1686
889 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 1687 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1698,9 @@ void task_numa_work(struct callback_head *work)
900 if (p->flags & PF_EXITING) 1698 if (p->flags & PF_EXITING)
901 return; 1699 return;
902 1700
903 /* 1701 if (!mm->numa_next_scan) {
904 * We do not care about task placement until a task runs on a node 1702 mm->numa_next_scan = now +
905 * other than the first one used by the address space. This is 1703 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
906 * largely because migrations are driven by what CPU the task
907 * is running on. If it's never scheduled on another node, it'll
908 * not migrate so why bother trapping the fault.
909 */
910 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 mm->first_nid = numa_node_id();
912 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 /* Are we running on a new node yet? */
914 if (numa_node_id() == mm->first_nid &&
915 !sched_feat_numa(NUMA_FORCE))
916 return;
917
918 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 }
920
921 /*
922 * Reset the scan period if enough time has gone by. Objective is that
923 * scanning will be reduced if pages are properly placed. As tasks
924 * can enter different phases this needs to be re-examined. Lacking
925 * proper tracking of reference behaviour, this blunt hammer is used.
926 */
927 migrate = mm->numa_next_reset;
928 if (time_after(now, migrate)) {
929 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 xchg(&mm->numa_next_reset, next_scan);
932 } 1704 }
933 1705
934 /* 1706 /*
@@ -938,20 +1710,20 @@ void task_numa_work(struct callback_head *work)
938 if (time_before(now, migrate)) 1710 if (time_before(now, migrate))
939 return; 1711 return;
940 1712
941 if (p->numa_scan_period == 0) 1713 if (p->numa_scan_period == 0) {
942 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1714 p->numa_scan_period_max = task_scan_max(p);
1715 p->numa_scan_period = task_scan_min(p);
1716 }
943 1717
944 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 1718 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 1719 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 return; 1720 return;
947 1721
948 /* 1722 /*
949 * Do not set pte_numa if the current running node is rate-limited. 1723 * Delay this task enough that another task of this mm will likely win
950 * This loses statistics on the fault but if we are unwilling to 1724 * the next time around.
951 * migrate to this node, it is less likely we can do useful work
952 */ 1725 */
953 if (migrate_ratelimited(numa_node_id())) 1726 p->node_stamp += 2 * TICK_NSEC;
954 return;
955 1727
956 start = mm->numa_scan_offset; 1728 start = mm->numa_scan_offset;
957 pages = sysctl_numa_balancing_scan_size; 1729 pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1739,32 @@ void task_numa_work(struct callback_head *work)
967 vma = mm->mmap; 1739 vma = mm->mmap;
968 } 1740 }
969 for (; vma; vma = vma->vm_next) { 1741 for (; vma; vma = vma->vm_next) {
970 if (!vma_migratable(vma)) 1742 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
971 continue; 1743 continue;
972 1744
973 /* Skip small VMAs. They are not likely to be of relevance */ 1745 /*
974 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 1746 * Shared library pages mapped by multiple processes are not
1747 * migrated as it is expected they are cache replicated. Avoid
1748 * hinting faults in read-only file-backed mappings or the vdso
1749 * as migrating the pages will be of marginal benefit.
1750 */
1751 if (!vma->vm_mm ||
1752 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
975 continue; 1753 continue;
976 1754
977 do { 1755 do {
978 start = max(start, vma->vm_start); 1756 start = max(start, vma->vm_start);
979 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1757 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 end = min(end, vma->vm_end); 1758 end = min(end, vma->vm_end);
981 pages -= change_prot_numa(vma, start, end); 1759 nr_pte_updates += change_prot_numa(vma, start, end);
1760
1761 /*
1762 * Scan sysctl_numa_balancing_scan_size but ensure that
1763 * at least one PTE is updated so that unused virtual
1764 * address space is quickly skipped.
1765 */
1766 if (nr_pte_updates)
1767 pages -= (end - start) >> PAGE_SHIFT;
982 1768
983 start = end; 1769 start = end;
984 if (pages <= 0) 1770 if (pages <= 0)
@@ -988,10 +1774,10 @@ void task_numa_work(struct callback_head *work)
988 1774
989out: 1775out:
990 /* 1776 /*
991 * It is possible to reach the end of the VMA list but the last few VMAs are 1777 * It is possible to reach the end of the VMA list but the last few
 992 * not guaranteed to be vma_migratable. If they are not, we would find the 1778 * VMAs are not guaranteed to be vma_migratable. If they are not, we
993 * !migratable VMA on the next scan but not reset the scanner to the start 1779 * would find the !migratable VMA on the next scan but not reset the
994 * so check it now. 1780 * scanner to the start so check it now.
995 */ 1781 */
996 if (vma) 1782 if (vma)
997 mm->numa_scan_offset = start; 1783 mm->numa_scan_offset = start;
@@ -1025,8 +1811,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1025 1811
1026 if (now - curr->node_stamp > period) { 1812 if (now - curr->node_stamp > period) {
1027 if (!curr->node_stamp) 1813 if (!curr->node_stamp)
1028 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1814 curr->numa_scan_period = task_scan_min(curr);
1029 curr->node_stamp = now; 1815 curr->node_stamp += period;
1030 1816
1031 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 1817 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 1818 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1824,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1038static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1824static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039{ 1825{
1040} 1826}
1827
1828static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1829{
1830}
1831
1832static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1833{
1834}
1041#endif /* CONFIG_NUMA_BALANCING */ 1835#endif /* CONFIG_NUMA_BALANCING */
1042 1836
1043static void 1837static void
@@ -1047,8 +1841,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1047 if (!parent_entity(se)) 1841 if (!parent_entity(se))
1048 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1842 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049#ifdef CONFIG_SMP 1843#ifdef CONFIG_SMP
1050 if (entity_is_task(se)) 1844 if (entity_is_task(se)) {
1051 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1845 struct rq *rq = rq_of(cfs_rq);
1846
1847 account_numa_enqueue(rq, task_of(se));
1848 list_add(&se->group_node, &rq->cfs_tasks);
1849 }
1052#endif 1850#endif
1053 cfs_rq->nr_running++; 1851 cfs_rq->nr_running++;
1054} 1852}
@@ -1059,8 +1857,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1059 update_load_sub(&cfs_rq->load, se->load.weight); 1857 update_load_sub(&cfs_rq->load, se->load.weight);
1060 if (!parent_entity(se)) 1858 if (!parent_entity(se))
1061 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1859 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 if (entity_is_task(se)) 1860 if (entity_is_task(se)) {
1861 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1063 list_del_init(&se->group_node); 1862 list_del_init(&se->group_node);
1863 }
1064 cfs_rq->nr_running--; 1864 cfs_rq->nr_running--;
1065} 1865}
1066 1866
@@ -1378,7 +2178,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1378 long contrib; 2178 long contrib;
1379 2179
1380 /* The fraction of a cpu used by this cfs_rq */ 2180 /* The fraction of a cpu used by this cfs_rq */
1381 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, 2181 contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
1382 sa->runnable_avg_period + 1); 2182 sa->runnable_avg_period + 1);
1383 contrib -= cfs_rq->tg_runnable_contrib; 2183 contrib -= cfs_rq->tg_runnable_contrib;
1384 2184
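
The (u64) cast added above matters when NICE_0_SHIFT is 20 (the increased load-resolution configuration); with the default shift of 10 the product still fits in 32 bits. A quick user-space demonstration, using an assumed runnable_avg_sum near its decayed-sum ceiling.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t runnable_avg_sum = 47742;   /* assumed value near the decayed-sum ceiling */
        unsigned int nice_0_shift = 20;      /* assumes the high load-resolution build */

        /* without the (u64) cast the shift is done in 32 bits and wraps */
        uint32_t wrapped = runnable_avg_sum << nice_0_shift;
        uint64_t correct = (uint64_t)runnable_avg_sum << nice_0_shift;

        printf("32-bit shift: %u\n", (unsigned)wrapped);
        printf("64-bit shift: %llu\n", (unsigned long long)correct);
        return 0;
}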
@@ -2070,13 +2870,14 @@ static inline bool cfs_bandwidth_used(void)
2070 return static_key_false(&__cfs_bandwidth_used); 2870 return static_key_false(&__cfs_bandwidth_used);
2071} 2871}
2072 2872
2073void account_cfs_bandwidth_used(int enabled, int was_enabled) 2873void cfs_bandwidth_usage_inc(void)
2074{ 2874{
2075 /* only need to count groups transitioning between enabled/!enabled */ 2875 static_key_slow_inc(&__cfs_bandwidth_used);
2076 if (enabled && !was_enabled) 2876}
2077 static_key_slow_inc(&__cfs_bandwidth_used); 2877
2078 else if (!enabled && was_enabled) 2878void cfs_bandwidth_usage_dec(void)
2079 static_key_slow_dec(&__cfs_bandwidth_used); 2879{
2880 static_key_slow_dec(&__cfs_bandwidth_used);
2080} 2881}
2081#else /* HAVE_JUMP_LABEL */ 2882#else /* HAVE_JUMP_LABEL */
2082static bool cfs_bandwidth_used(void) 2883static bool cfs_bandwidth_used(void)
@@ -2084,7 +2885,8 @@ static bool cfs_bandwidth_used(void)
2084 return true; 2885 return true;
2085} 2886}
2086 2887
2087void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 2888void cfs_bandwidth_usage_inc(void) {}
2889void cfs_bandwidth_usage_dec(void) {}
2088#endif /* HAVE_JUMP_LABEL */ 2890#endif /* HAVE_JUMP_LABEL */
2089 2891
2090/* 2892/*
@@ -2335,6 +3137,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2335 cfs_rq->throttled_clock = rq_clock(rq); 3137 cfs_rq->throttled_clock = rq_clock(rq);
2336 raw_spin_lock(&cfs_b->lock); 3138 raw_spin_lock(&cfs_b->lock);
2337 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3139 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3140 if (!cfs_b->timer_active)
3141 __start_cfs_bandwidth(cfs_b);
2338 raw_spin_unlock(&cfs_b->lock); 3142 raw_spin_unlock(&cfs_b->lock);
2339} 3143}
2340 3144
@@ -2448,6 +3252,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2448 if (idle) 3252 if (idle)
2449 goto out_unlock; 3253 goto out_unlock;
2450 3254
3255 /*
3256 * if we have relooped after returning idle once, we need to update our
3257 * status as actually running, so that other cpus doing
3258 * __start_cfs_bandwidth will stop trying to cancel us.
3259 */
3260 cfs_b->timer_active = 1;
3261
2451 __refill_cfs_bandwidth_runtime(cfs_b); 3262 __refill_cfs_bandwidth_runtime(cfs_b);
2452 3263
2453 if (!throttled) { 3264 if (!throttled) {
@@ -2508,7 +3319,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2508/* how long we wait to gather additional slack before distributing */ 3319/* how long we wait to gather additional slack before distributing */
2509static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 3320static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2510 3321
2511/* are we near the end of the current quota period? */ 3322/*
3323 * Are we near the end of the current quota period?
3324 *
3325 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3326 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3327 * migrate_hrtimers, base is never cleared, so we are fine.
3328 */
2512static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 3329static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2513{ 3330{
2514 struct hrtimer *refresh_timer = &cfs_b->period_timer; 3331 struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2584,10 +3401,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2584 u64 expires; 3401 u64 expires;
2585 3402
2586 /* confirm we're still not at a refresh boundary */ 3403 /* confirm we're still not at a refresh boundary */
2587 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) 3404 raw_spin_lock(&cfs_b->lock);
3405 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3406 raw_spin_unlock(&cfs_b->lock);
2588 return; 3407 return;
3408 }
2589 3409
2590 raw_spin_lock(&cfs_b->lock);
2591 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3410 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2592 runtime = cfs_b->runtime; 3411 runtime = cfs_b->runtime;
2593 cfs_b->runtime = 0; 3412 cfs_b->runtime = 0;
@@ -2708,11 +3527,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2708 * (timer_active==0 becomes visible before the hrtimer call-back 3527 * (timer_active==0 becomes visible before the hrtimer call-back
2709 * terminates). In either case we ensure that it's re-programmed 3528 * terminates). In either case we ensure that it's re-programmed
2710 */ 3529 */
2711 while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 3530 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3531 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3532 /* bounce the lock to allow do_sched_cfs_period_timer to run */
2712 raw_spin_unlock(&cfs_b->lock); 3533 raw_spin_unlock(&cfs_b->lock);
2713 /* ensure cfs_b->lock is available while we wait */ 3534 cpu_relax();
2714 hrtimer_cancel(&cfs_b->period_timer);
2715
2716 raw_spin_lock(&cfs_b->lock); 3535 raw_spin_lock(&cfs_b->lock);
2717 /* if someone else restarted the timer then we're done */ 3536 /* if someone else restarted the timer then we're done */
2718 if (cfs_b->timer_active) 3537 if (cfs_b->timer_active)
@@ -3113,7 +3932,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3113{ 3932{
3114 struct sched_entity *se = tg->se[cpu]; 3933 struct sched_entity *se = tg->se[cpu];
3115 3934
3116 if (!tg->parent) /* the trivial, non-cgroup case */ 3935 if (!tg->parent || !wl) /* the trivial, non-cgroup case */
3117 return wl; 3936 return wl;
3118 3937
3119 for_each_sched_entity(se) { 3938 for_each_sched_entity(se) {
@@ -3166,8 +3985,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3166} 3985}
3167#else 3986#else
3168 3987
3169static inline unsigned long effective_load(struct task_group *tg, int cpu, 3988static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3170 unsigned long wl, unsigned long wg)
3171{ 3989{
3172 return wl; 3990 return wl;
3173} 3991}
@@ -3420,11 +4238,10 @@ done:
3420 * preempt must be disabled. 4238 * preempt must be disabled.
3421 */ 4239 */
3422static int 4240static int
3423select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 4241select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
3424{ 4242{
3425 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4243 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3426 int cpu = smp_processor_id(); 4244 int cpu = smp_processor_id();
3427 int prev_cpu = task_cpu(p);
3428 int new_cpu = cpu; 4245 int new_cpu = cpu;
3429 int want_affine = 0; 4246 int want_affine = 0;
3430 int sync = wake_flags & WF_SYNC; 4247 int sync = wake_flags & WF_SYNC;
@@ -3904,9 +4721,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3904 4721
3905static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4722static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3906 4723
4724enum fbq_type { regular, remote, all };
4725
3907#define LBF_ALL_PINNED 0x01 4726#define LBF_ALL_PINNED 0x01
3908#define LBF_NEED_BREAK 0x02 4727#define LBF_NEED_BREAK 0x02
3909#define LBF_SOME_PINNED 0x04 4728#define LBF_DST_PINNED 0x04
4729#define LBF_SOME_PINNED 0x08
3910 4730
3911struct lb_env { 4731struct lb_env {
3912 struct sched_domain *sd; 4732 struct sched_domain *sd;
@@ -3929,6 +4749,8 @@ struct lb_env {
3929 unsigned int loop; 4749 unsigned int loop;
3930 unsigned int loop_break; 4750 unsigned int loop_break;
3931 unsigned int loop_max; 4751 unsigned int loop_max;
4752
4753 enum fbq_type fbq_type;
3932}; 4754};
3933 4755
3934/* 4756/*
@@ -3975,6 +4797,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3975 return delta < (s64)sysctl_sched_migration_cost; 4797 return delta < (s64)sysctl_sched_migration_cost;
3976} 4798}
3977 4799
4800#ifdef CONFIG_NUMA_BALANCING
4801/* Returns true if the destination node has incurred more faults */
4802static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4803{
4804 int src_nid, dst_nid;
4805
4806 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4807 !(env->sd->flags & SD_NUMA)) {
4808 return false;
4809 }
4810
4811 src_nid = cpu_to_node(env->src_cpu);
4812 dst_nid = cpu_to_node(env->dst_cpu);
4813
4814 if (src_nid == dst_nid)
4815 return false;
4816
4817 /* Always encourage migration to the preferred node. */
4818 if (dst_nid == p->numa_preferred_nid)
4819 return true;
4820
4821 /* If both task and group weight improve, this move is a winner. */
4822 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4823 group_weight(p, dst_nid) > group_weight(p, src_nid))
4824 return true;
4825
4826 return false;
4827}
4828
4829
4830static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4831{
4832 int src_nid, dst_nid;
4833
4834 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4835 return false;
4836
4837 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4838 return false;
4839
4840 src_nid = cpu_to_node(env->src_cpu);
4841 dst_nid = cpu_to_node(env->dst_cpu);
4842
4843 if (src_nid == dst_nid)
4844 return false;
4845
4846 /* Migrating away from the preferred node is always bad. */
4847 if (src_nid == p->numa_preferred_nid)
4848 return true;
4849
4850 /* If either task or group weight get worse, don't do it. */
4851 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4852 group_weight(p, dst_nid) < group_weight(p, src_nid))
4853 return true;
4854
4855 return false;
4856}
4857
4858#else
4859static inline bool migrate_improves_locality(struct task_struct *p,
4860 struct lb_env *env)
4861{
4862 return false;
4863}
4864
4865static inline bool migrate_degrades_locality(struct task_struct *p,
4866 struct lb_env *env)
4867{
4868 return false;
4869}
4870#endif
4871
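
A simplified sketch of how the two NUMA gates above interact, omitting the sched_feat() and SD_NUMA checks; the weights and preferred node are invented. Moving toward the preferred node is always treated as an improvement, while moving off it always counts as cache-hot in can_migrate_task() below.

#include <stdio.h>
#include <stdbool.h>

/* toy per-node weights for one task, standing in for task_weight()/group_weight() */
struct weights { long task[2]; long group[2]; };

static bool improves(const struct weights *w, int src, int dst, int preferred)
{
        if (dst == preferred)
                return true;
        return w->task[dst] > w->task[src] && w->group[dst] > w->group[src];
}

static bool degrades(const struct weights *w, int src, int dst, int preferred)
{
        if (src == preferred)
                return true;
        return w->task[dst] < w->task[src] || w->group[dst] < w->group[src];
}

int main(void)
{
        /* assumed weights: most faults on node 1, which is also the preferred node */
        struct weights w = { .task = { 200, 800 }, .group = { 300, 700 } };

        printf("0->1 improves=%d degrades=%d\n", improves(&w, 0, 1, 1), degrades(&w, 0, 1, 1));
        printf("1->0 improves=%d degrades=%d\n", improves(&w, 1, 0, 1), degrades(&w, 1, 0, 1));
        return 0;
}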
3978/* 4872/*
3979 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 4873 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3980 */ 4874 */
@@ -3997,6 +4891,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3997 4891
3998 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 4892 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3999 4893
4894 env->flags |= LBF_SOME_PINNED;
4895
4000 /* 4896 /*
4001 * Remember if this task can be migrated to any other cpu in 4897 * Remember if this task can be migrated to any other cpu in
4002 * our sched_group. We may want to revisit it if we couldn't 4898 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4901,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4005 * Also avoid computing new_dst_cpu if we have already computed 4901 * Also avoid computing new_dst_cpu if we have already computed
4006 * one in current iteration. 4902 * one in current iteration.
4007 */ 4903 */
4008 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 4904 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
4009 return 0; 4905 return 0;
4010 4906
4011 /* Prevent to re-select dst_cpu via env's cpus */ 4907 /* Prevent to re-select dst_cpu via env's cpus */
4012 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 4908 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4013 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 4909 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4014 env->flags |= LBF_SOME_PINNED; 4910 env->flags |= LBF_DST_PINNED;
4015 env->new_dst_cpu = cpu; 4911 env->new_dst_cpu = cpu;
4016 break; 4912 break;
4017 } 4913 }
@@ -4030,11 +4926,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4030 4926
4031 /* 4927 /*
4032 * Aggressive migration if: 4928 * Aggressive migration if:
4033 * 1) task is cache cold, or 4929 * 1) destination numa is preferred
4034 * 2) too many balance attempts have failed. 4930 * 2) task is cache cold, or
4931 * 3) too many balance attempts have failed.
4035 */ 4932 */
4036
4037 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 4933 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4934 if (!tsk_cache_hot)
4935 tsk_cache_hot = migrate_degrades_locality(p, env);
4936
4937 if (migrate_improves_locality(p, env)) {
4938#ifdef CONFIG_SCHEDSTATS
4939 if (tsk_cache_hot) {
4940 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4941 schedstat_inc(p, se.statistics.nr_forced_migrations);
4942 }
4943#endif
4944 return 1;
4945 }
4946
4038 if (!tsk_cache_hot || 4947 if (!tsk_cache_hot ||
4039 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 4948 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4040 4949
@@ -4077,8 +4986,6 @@ static int move_one_task(struct lb_env *env)
4077 return 0; 4986 return 0;
4078} 4987}
4079 4988
4080static unsigned long task_h_load(struct task_struct *p);
4081
4082static const unsigned int sched_nr_migrate_break = 32; 4989static const unsigned int sched_nr_migrate_break = 32;
4083 4990
4084/* 4991/*
@@ -4242,7 +5149,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
4242 } 5149 }
4243 5150
4244 if (!se) { 5151 if (!se) {
4245 cfs_rq->h_load = rq->avg.load_avg_contrib; 5152 cfs_rq->h_load = cfs_rq->runnable_load_avg;
4246 cfs_rq->last_h_load_update = now; 5153 cfs_rq->last_h_load_update = now;
4247 } 5154 }
4248 5155
@@ -4291,6 +5198,10 @@ struct sg_lb_stats {
4291 unsigned int group_weight; 5198 unsigned int group_weight;
4292 int group_imb; /* Is there an imbalance in the group ? */ 5199 int group_imb; /* Is there an imbalance in the group ? */
4293 int group_has_capacity; /* Is there extra capacity in the group? */ 5200 int group_has_capacity; /* Is there extra capacity in the group? */
5201#ifdef CONFIG_NUMA_BALANCING
5202 unsigned int nr_numa_running;
5203 unsigned int nr_preferred_running;
5204#endif
4294}; 5205};
4295 5206
4296/* 5207/*
@@ -4330,7 +5241,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4330/** 5241/**
4331 * get_sd_load_idx - Obtain the load index for a given sched domain. 5242 * get_sd_load_idx - Obtain the load index for a given sched domain.
4332 * @sd: The sched_domain whose load_idx is to be obtained. 5243 * @sd: The sched_domain whose load_idx is to be obtained.
4333 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 5244 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
4334 * 5245 *
4335 * Return: The load index. 5246 * Return: The load index.
4336 */ 5247 */
@@ -4447,7 +5358,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4447{ 5358{
4448 struct sched_domain *child = sd->child; 5359 struct sched_domain *child = sd->child;
4449 struct sched_group *group, *sdg = sd->groups; 5360 struct sched_group *group, *sdg = sd->groups;
4450 unsigned long power; 5361 unsigned long power, power_orig;
4451 unsigned long interval; 5362 unsigned long interval;
4452 5363
4453 interval = msecs_to_jiffies(sd->balance_interval); 5364 interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +5370,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4459 return; 5370 return;
4460 } 5371 }
4461 5372
4462 power = 0; 5373 power_orig = power = 0;
4463 5374
4464 if (child->flags & SD_OVERLAP) { 5375 if (child->flags & SD_OVERLAP) {
4465 /* 5376 /*
@@ -4467,8 +5378,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
4467 * span the current group. 5378 * span the current group.
4468 */ 5379 */
4469 5380
4470 for_each_cpu(cpu, sched_group_cpus(sdg)) 5381 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4471 power += power_of(cpu); 5382 struct sched_group *sg = cpu_rq(cpu)->sd->groups;
5383
5384 power_orig += sg->sgp->power_orig;
5385 power += sg->sgp->power;
5386 }
4472 } else { 5387 } else {
4473 /* 5388 /*
4474 * !SD_OVERLAP domains can assume that child groups 5389 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +5392,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
4477 5392
4478 group = child->groups; 5393 group = child->groups;
4479 do { 5394 do {
5395 power_orig += group->sgp->power_orig;
4480 power += group->sgp->power; 5396 power += group->sgp->power;
4481 group = group->next; 5397 group = group->next;
4482 } while (group != child->groups); 5398 } while (group != child->groups);
4483 } 5399 }
4484 5400
4485 sdg->sgp->power_orig = sdg->sgp->power = power; 5401 sdg->sgp->power_orig = power_orig;
5402 sdg->sgp->power = power;
4486} 5403}
4487 5404
4488/* 5405/*
@@ -4526,13 +5443,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4526 * cpu 3 and leave one of the cpus in the second group unused. 5443 * cpu 3 and leave one of the cpus in the second group unused.
4527 * 5444 *
4528 * The current solution to this issue is detecting the skew in the first group 5445 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are 5446 * by noticing the lower domain failed to reach balance and had difficulty
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see 5447 * moving tasks due to affinity constraints.
4531 * sg_imbalanced().
4532 * 5448 *
4533 * When this is so detected; this group becomes a candidate for busiest; see 5449 * When this is so detected; this group becomes a candidate for busiest; see
4534 * update_sd_pick_busiest(). And calculcate_imbalance() and 5450 * update_sd_pick_busiest(). And calculate_imbalance() and
4535 * find_busiest_group() avoid some of the usual balance conditional to allow it 5451 * find_busiest_group() avoid some of the usual balance conditions to allow it
4536 * to create an effective group imbalance. 5452 * to create an effective group imbalance.
4537 * 5453 *
4538 * This is a somewhat tricky proposition since the next run might not find the 5454 * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +5456,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4540 * subtle and fragile situation. 5456 * subtle and fragile situation.
4541 */ 5457 */
4542 5458
4543struct sg_imb_stats { 5459static inline int sg_imbalanced(struct sched_group *group)
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{ 5460{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL; 5461 return group->sgp->imbalance;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552} 5462}
4553 5463
4554static inline void 5464/*
4555update_sg_imb_stats(struct sg_imb_stats *sgi, 5465 * Compute the group capacity.
4556 unsigned long load, unsigned long nr_running) 5466 *
5467 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5468 * first dividing out the smt factor and computing the actual number of cores
5469 * and limit power unit capacity with that.
5470 */
5471static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
4557{ 5472{
4558 if (load > sgi->max_cpu_load) 5473 unsigned int capacity, smt, cpus;
4559 sgi->max_cpu_load = load; 5474 unsigned int power, power_orig;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562 5475
4563 if (nr_running > sgi->max_nr_running) 5476 power = group->sgp->power;
4564 sgi->max_nr_running = nr_running; 5477 power_orig = group->sgp->power_orig;
4565 if (sgi->min_nr_running > nr_running) 5478 cpus = group->group_weight;
4566 sgi->min_nr_running = nr_running;
4567}
4568 5479
4569static inline int 5480 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) 5481 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
4571{ 5482 capacity = cpus / smt; /* cores */
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584 5483
4585 return 0; 5484 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5485 if (!capacity)
5486 capacity = fix_small_capacity(env->sd, group);
5487
5488 return capacity;
4586} 5489}
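The "phantom core" guard in sg_capacity() is easiest to see with numbers. A minimal userspace sketch, not part of this patch, using an assumed SMT-2 package where each core contributes roughly 1.15 * SCHED_POWER_SCALE of power:

#include <stdio.h>

#define SCHED_POWER_SCALE	1024U
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + (d) / 2) / (d))

int main(void)
{
	unsigned int cpus = 14;			/* 7 cores, 2 SMT threads each */
	unsigned int power_orig = 7 * 1178;	/* ~1.15 * 1024 per core (assumed) */
	unsigned int smt, cap_old, cap_new;

	/* old scheme: capacity straight from power; 8246/1024 rounds to 8 */
	cap_old = DIV_ROUND_CLOSEST(power_orig, SCHED_POWER_SCALE);

	/* new scheme: divide out the smt factor first, then count whole cores */
	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);	/* = 2 */
	cap_new = cpus / smt;						/* = 7 */

	printf("old capacity = %u (one phantom core), new capacity = %u\n",
	       cap_old, cap_new);
	return 0;
}

The old rounding credits the group with 8 units of capacity although it only has 7 physical cores; dividing out the SMT factor first keeps the result at 7.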
4587 5490
4588/** 5491/**
@@ -4597,12 +5500,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4597 struct sched_group *group, int load_idx, 5500 struct sched_group *group, int load_idx,
4598 int local_group, struct sg_lb_stats *sgs) 5501 int local_group, struct sg_lb_stats *sgs)
4599{ 5502{
4600 struct sg_imb_stats sgi;
4601 unsigned long nr_running; 5503 unsigned long nr_running;
4602 unsigned long load; 5504 unsigned long load;
4603 int i; 5505 int i;
4604 5506
4605 init_sg_imb_stats(&sgi); 5507 memset(sgs, 0, sizeof(*sgs));
4606 5508
4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5509 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4608 struct rq *rq = cpu_rq(i); 5510 struct rq *rq = cpu_rq(i);
@@ -4610,24 +5512,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4610 nr_running = rq->nr_running; 5512 nr_running = rq->nr_running;
4611 5513
4612 /* Bias balancing toward cpus of our domain */ 5514 /* Bias balancing toward cpus of our domain */
4613 if (local_group) { 5515 if (local_group)
4614 load = target_load(i, load_idx); 5516 load = target_load(i, load_idx);
4615 } else { 5517 else
4616 load = source_load(i, load_idx); 5518 load = source_load(i, load_idx);
4617 update_sg_imb_stats(&sgi, load, nr_running);
4618 }
4619 5519
4620 sgs->group_load += load; 5520 sgs->group_load += load;
4621 sgs->sum_nr_running += nr_running; 5521 sgs->sum_nr_running += nr_running;
5522#ifdef CONFIG_NUMA_BALANCING
5523 sgs->nr_numa_running += rq->nr_numa_running;
5524 sgs->nr_preferred_running += rq->nr_preferred_running;
5525#endif
4622 sgs->sum_weighted_load += weighted_cpuload(i); 5526 sgs->sum_weighted_load += weighted_cpuload(i);
4623 if (idle_cpu(i)) 5527 if (idle_cpu(i))
4624 sgs->idle_cpus++; 5528 sgs->idle_cpus++;
4625 } 5529 }
4626 5530
4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4628 time_after_eq(jiffies, group->sgp->next_update)))
4629 update_group_power(env->sd, env->dst_cpu);
4630
4631 /* Adjust by relative CPU power of the group */ 5531 /* Adjust by relative CPU power of the group */
4632 sgs->group_power = group->sgp->power; 5532 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5533 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +5535,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4635 if (sgs->sum_nr_running) 5535 if (sgs->sum_nr_running)
4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5536 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637 5537
4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4639
4640 sgs->group_capacity =
4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4642
4643 if (!sgs->group_capacity)
4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4646 sgs->group_weight = group->group_weight; 5538 sgs->group_weight = group->group_weight;
4647 5539
5540 sgs->group_imb = sg_imbalanced(group);
5541 sgs->group_capacity = sg_capacity(env, group);
5542
4648 if (sgs->group_capacity > sgs->sum_nr_running) 5543 if (sgs->group_capacity > sgs->sum_nr_running)
4649 sgs->group_has_capacity = 1; 5544 sgs->group_has_capacity = 1;
4650} 5545}
@@ -4693,14 +5588,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4693 return false; 5588 return false;
4694} 5589}
4695 5590
5591#ifdef CONFIG_NUMA_BALANCING
5592static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5593{
5594 if (sgs->sum_nr_running > sgs->nr_numa_running)
5595 return regular;
5596 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5597 return remote;
5598 return all;
5599}
5600
5601static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5602{
5603 if (rq->nr_running > rq->nr_numa_running)
5604 return regular;
5605 if (rq->nr_running > rq->nr_preferred_running)
5606 return remote;
5607 return all;
5608}
5609#else
5610static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5611{
5612 return all;
5613}
5614
5615static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5616{
5617 return regular;
5618}
5619#endif /* CONFIG_NUMA_BALANCING */
5620
4696/** 5621/**
4697 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5622 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4698 * @env: The load balancing environment. 5623 * @env: The load balancing environment.
4699 * @balance: Should we balance.
4700 * @sds: variable to hold the statistics for this sched_domain. 5624 * @sds: variable to hold the statistics for this sched_domain.
4701 */ 5625 */
4702static inline void update_sd_lb_stats(struct lb_env *env, 5626static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
4703 struct sd_lb_stats *sds)
4704{ 5627{
4705 struct sched_domain *child = env->sd->child; 5628 struct sched_domain *child = env->sd->child;
4706 struct sched_group *sg = env->sd->groups; 5629 struct sched_group *sg = env->sd->groups;
@@ -4720,11 +5643,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4720 if (local_group) { 5643 if (local_group) {
4721 sds->local = sg; 5644 sds->local = sg;
4722 sgs = &sds->local_stat; 5645 sgs = &sds->local_stat;
5646
5647 if (env->idle != CPU_NEWLY_IDLE ||
5648 time_after_eq(jiffies, sg->sgp->next_update))
5649 update_group_power(env->sd, env->dst_cpu);
4723 } 5650 }
4724 5651
4725 memset(sgs, 0, sizeof(*sgs));
4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 5652 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4727 5653
5654 if (local_group)
5655 goto next_group;
5656
4728 /* 5657 /*
4729 * In case the child domain prefers tasks go to siblings 5658 * In case the child domain prefers tasks go to siblings
4730 * first, lower the sg capacity to one so that we'll try 5659 * first, lower the sg capacity to one so that we'll try
@@ -4735,21 +5664,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4735 * heaviest group when it is already under-utilized (possible 5664 * heaviest group when it is already under-utilized (possible
4736 * with a large weight task outweighs the tasks on the system). 5665 * with a large weight task outweighs the tasks on the system).
4737 */ 5666 */
4738 if (prefer_sibling && !local_group && 5667 if (prefer_sibling && sds->local &&
4739 sds->local && sds->local_stat.group_has_capacity) 5668 sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U); 5669 sgs->group_capacity = min(sgs->group_capacity, 1U);
4741 5670
4742 /* Now, start updating sd_lb_stats */ 5671 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
4743 sds->total_load += sgs->group_load;
4744 sds->total_pwr += sgs->group_power;
4745
4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4747 sds->busiest = sg; 5672 sds->busiest = sg;
4748 sds->busiest_stat = *sgs; 5673 sds->busiest_stat = *sgs;
4749 } 5674 }
4750 5675
5676next_group:
5677 /* Now, start updating sd_lb_stats */
5678 sds->total_load += sgs->group_load;
5679 sds->total_pwr += sgs->group_power;
5680
4751 sg = sg->next; 5681 sg = sg->next;
4752 } while (sg != env->sd->groups); 5682 } while (sg != env->sd->groups);
5683
5684 if (env->sd->flags & SD_NUMA)
5685 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4753} 5686}
4754 5687
4755/** 5688/**
@@ -4823,8 +5756,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4823 (busiest->load_per_task * SCHED_POWER_SCALE) / 5756 (busiest->load_per_task * SCHED_POWER_SCALE) /
4824 busiest->group_power; 5757 busiest->group_power;
4825 5758
4826 if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= 5759 if (busiest->avg_load + scaled_busy_load_per_task >=
4827 (scaled_busy_load_per_task * imbn)) { 5760 local->avg_load + (scaled_busy_load_per_task * imbn)) {
4828 env->imbalance = busiest->load_per_task; 5761 env->imbalance = busiest->load_per_task;
4829 return; 5762 return;
4830 } 5763 }
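One reason to prefer the rearranged comparison is that all the loads involved are unsigned: when the busiest group is in fact lighter than the local one, the old left-hand side wraps around instead of going negative. A toy demonstration with invented values, not scheduler code:

#include <stdio.h>

int main(void)
{
	unsigned long busiest_avg = 100, local_avg = 300;
	unsigned long scaled_busy_load_per_task = 50, imbn = 2;

	/* old form: busiest_avg - local_avg wraps to a huge value, test passes */
	int old_form = (busiest_avg - local_avg + scaled_busy_load_per_task >=
			scaled_busy_load_per_task * imbn);

	/* new form: both sides stay non-negative, test fails as intended */
	int new_form = (busiest_avg + scaled_busy_load_per_task >=
			local_avg + scaled_busy_load_per_task * imbn);

	printf("old form: %d, new form: %d\n", old_form, new_form);
	return 0;
}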
@@ -4896,7 +5829,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4896 * max load less than avg load(as we skip the groups at or below 5829 * max load less than avg load(as we skip the groups at or below
4897 * its cpu_power, while calculating max_load..) 5830 * its cpu_power, while calculating max_load..)
4898 */ 5831 */
4899 if (busiest->avg_load < sds->avg_load) { 5832 if (busiest->avg_load <= sds->avg_load ||
5833 local->avg_load >= sds->avg_load) {
4900 env->imbalance = 0; 5834 env->imbalance = 0;
4901 return fix_small_imbalance(env, sds); 5835 return fix_small_imbalance(env, sds);
4902 } 5836 }
@@ -5052,15 +5986,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5052 int i; 5986 int i;
5053 5987
5054 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5988 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5055 unsigned long power = power_of(i); 5989 unsigned long power, capacity, wl;
5056 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5990 enum fbq_type rt;
5057 SCHED_POWER_SCALE);
5058 unsigned long wl;
5059 5991
5992 rq = cpu_rq(i);
5993 rt = fbq_classify_rq(rq);
5994
5995 /*
5996 * We classify groups/runqueues into three groups:
5997 * - regular: there are !numa tasks
5998 * - remote: there are numa tasks that run on the 'wrong' node
5999 * - all: there is no distinction
6000 *
6001 * In order to avoid migrating ideally placed numa tasks,
 6002 * ignore those when there are better options.
6003 *
6004 * If we ignore the actual busiest queue to migrate another
6005 * task, the next balance pass can still reduce the busiest
6006 * queue by moving tasks around inside the node.
6007 *
6008 * If we cannot move enough load due to this classification
6009 * the next pass will adjust the group classification and
6010 * allow migration of more tasks.
6011 *
6012 * Both cases only affect the total convergence complexity.
6013 */
6014 if (rt > env->fbq_type)
6015 continue;
6016
6017 power = power_of(i);
6018 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5060 if (!capacity) 6019 if (!capacity)
5061 capacity = fix_small_capacity(env->sd, group); 6020 capacity = fix_small_capacity(env->sd, group);
5062 6021
5063 rq = cpu_rq(i);
5064 wl = weighted_cpuload(i); 6022 wl = weighted_cpuload(i);
5065 6023
5066 /* 6024 /*
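A toy model of the "rt > env->fbq_type" filter, not kernel code; the enum order regular < remote < all is assumed from the way the comparison is used above:

#include <stdio.h>

enum fbq_type { regular, remote, all };

static const char * const fbq_name[] = { "regular", "remote", "all" };

int main(void)
{
	enum fbq_type group_type = remote;	/* as from fbq_classify_group() */
	enum fbq_type rq_type;

	for (rq_type = regular; rq_type <= all; rq_type++) {
		if (rq_type > group_type) {
			printf("%s rq: skipped, its tasks are already well placed\n",
			       fbq_name[rq_type]);
			continue;
		}
		printf("%s rq: considered for balancing\n", fbq_name[rq_type]);
	}
	return 0;
}

With the group classified 'remote', runqueues whose tasks all run on their preferred node ('all') are left alone, and a later pass can still rebalance inside the node.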
@@ -5151,7 +6109,7 @@ static int should_we_balance(struct lb_env *env)
5151 * First idle cpu or the first cpu(busiest) in this sched group 6109 * First idle cpu or the first cpu(busiest) in this sched group
5152 * is eligible for doing load balancing at this and above domains. 6110 * is eligible for doing load balancing at this and above domains.
5153 */ 6111 */
5154 return balance_cpu != env->dst_cpu; 6112 return balance_cpu == env->dst_cpu;
5155} 6113}
5156 6114
5157/* 6115/*
@@ -5163,6 +6121,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5163 int *continue_balancing) 6121 int *continue_balancing)
5164{ 6122{
5165 int ld_moved, cur_ld_moved, active_balance = 0; 6123 int ld_moved, cur_ld_moved, active_balance = 0;
6124 struct sched_domain *sd_parent = sd->parent;
5166 struct sched_group *group; 6125 struct sched_group *group;
5167 struct rq *busiest; 6126 struct rq *busiest;
5168 unsigned long flags; 6127 unsigned long flags;
@@ -5176,6 +6135,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5176 .idle = idle, 6135 .idle = idle,
5177 .loop_break = sched_nr_migrate_break, 6136 .loop_break = sched_nr_migrate_break,
5178 .cpus = cpus, 6137 .cpus = cpus,
6138 .fbq_type = all,
5179 }; 6139 };
5180 6140
5181 /* 6141 /*
@@ -5267,17 +6227,17 @@ more_balance:
5267 * moreover subsequent load balance cycles should correct the 6227 * moreover subsequent load balance cycles should correct the
5268 * excess load moved. 6228 * excess load moved.
5269 */ 6229 */
5270 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6230 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6231
6232 /* Prevent to re-select dst_cpu via env's cpus */
6233 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5271 6234
5272 env.dst_rq = cpu_rq(env.new_dst_cpu); 6235 env.dst_rq = cpu_rq(env.new_dst_cpu);
5273 env.dst_cpu = env.new_dst_cpu; 6236 env.dst_cpu = env.new_dst_cpu;
5274 env.flags &= ~LBF_SOME_PINNED; 6237 env.flags &= ~LBF_DST_PINNED;
5275 env.loop = 0; 6238 env.loop = 0;
5276 env.loop_break = sched_nr_migrate_break; 6239 env.loop_break = sched_nr_migrate_break;
5277 6240
5278 /* Prevent to re-select dst_cpu via env's cpus */
5279 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5280
5281 /* 6241 /*
5282 * Go back to "more_balance" rather than "redo" since we 6242 * Go back to "more_balance" rather than "redo" since we
5283 * need to continue with same src_cpu. 6243 * need to continue with same src_cpu.
@@ -5285,6 +6245,18 @@ more_balance:
5285 goto more_balance; 6245 goto more_balance;
5286 } 6246 }
5287 6247
6248 /*
6249 * We failed to reach balance because of affinity.
6250 */
6251 if (sd_parent) {
6252 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6253
6254 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6255 *group_imbalance = 1;
6256 } else if (*group_imbalance)
6257 *group_imbalance = 0;
6258 }
6259
5288 /* All tasks on this runqueue were pinned by CPU affinity */ 6260 /* All tasks on this runqueue were pinned by CPU affinity */
5289 if (unlikely(env.flags & LBF_ALL_PINNED)) { 6261 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5290 cpumask_clear_cpu(cpu_of(busiest), cpus); 6262 cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5392,6 +6364,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5392 struct sched_domain *sd; 6364 struct sched_domain *sd;
5393 int pulled_task = 0; 6365 int pulled_task = 0;
5394 unsigned long next_balance = jiffies + HZ; 6366 unsigned long next_balance = jiffies + HZ;
6367 u64 curr_cost = 0;
5395 6368
5396 this_rq->idle_stamp = rq_clock(this_rq); 6369 this_rq->idle_stamp = rq_clock(this_rq);
5397 6370
@@ -5408,15 +6381,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5408 for_each_domain(this_cpu, sd) { 6381 for_each_domain(this_cpu, sd) {
5409 unsigned long interval; 6382 unsigned long interval;
5410 int continue_balancing = 1; 6383 int continue_balancing = 1;
6384 u64 t0, domain_cost;
5411 6385
5412 if (!(sd->flags & SD_LOAD_BALANCE)) 6386 if (!(sd->flags & SD_LOAD_BALANCE))
5413 continue; 6387 continue;
5414 6388
6389 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6390 break;
6391
5415 if (sd->flags & SD_BALANCE_NEWIDLE) { 6392 if (sd->flags & SD_BALANCE_NEWIDLE) {
6393 t0 = sched_clock_cpu(this_cpu);
6394
5416 /* If we've pulled tasks over stop searching: */ 6395 /* If we've pulled tasks over stop searching: */
5417 pulled_task = load_balance(this_cpu, this_rq, 6396 pulled_task = load_balance(this_cpu, this_rq,
5418 sd, CPU_NEWLY_IDLE, 6397 sd, CPU_NEWLY_IDLE,
5419 &continue_balancing); 6398 &continue_balancing);
6399
6400 domain_cost = sched_clock_cpu(this_cpu) - t0;
6401 if (domain_cost > sd->max_newidle_lb_cost)
6402 sd->max_newidle_lb_cost = domain_cost;
6403
6404 curr_cost += domain_cost;
5420 } 6405 }
5421 6406
5422 interval = msecs_to_jiffies(sd->balance_interval); 6407 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5438,6 +6423,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5438 */ 6423 */
5439 this_rq->next_balance = next_balance; 6424 this_rq->next_balance = next_balance;
5440 } 6425 }
6426
6427 if (curr_cost > this_rq->max_idle_balance_cost)
6428 this_rq->max_idle_balance_cost = curr_cost;
5441} 6429}
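Together these hunks implement a simple time budget: newly-idle balancing stops as soon as the accumulated worst-case cost of the domains visited so far would exceed the time the cpu expects to stay idle. A hedged sketch with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long avg_idle = 500000;	/* expected idle time, ns */
	unsigned long long max_newidle_lb_cost[] = { 120000, 250000, 400000 };
	unsigned long long curr_cost = 0;
	int i;

	for (i = 0; i < 3; i++) {
		if (avg_idle < curr_cost + max_newidle_lb_cost[i]) {
			printf("domain %d: skipped, balancing would outlast the idle window\n", i);
			break;
		}
		curr_cost += max_newidle_lb_cost[i];
		printf("domain %d: balanced, cumulative cost %llu ns\n",
		       i, curr_cost);
	}
	return 0;
}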
5442 6430
5443/* 6431/*
@@ -5571,16 +6559,16 @@ static inline void nohz_balance_exit_idle(int cpu)
5571static inline void set_cpu_sd_state_busy(void) 6559static inline void set_cpu_sd_state_busy(void)
5572{ 6560{
5573 struct sched_domain *sd; 6561 struct sched_domain *sd;
6562 int cpu = smp_processor_id();
5574 6563
5575 rcu_read_lock(); 6564 rcu_read_lock();
5576 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6565 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5577 6566
5578 if (!sd || !sd->nohz_idle) 6567 if (!sd || !sd->nohz_idle)
5579 goto unlock; 6568 goto unlock;
5580 sd->nohz_idle = 0; 6569 sd->nohz_idle = 0;
5581 6570
5582 for (; sd; sd = sd->parent) 6571 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5583 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5584unlock: 6572unlock:
5585 rcu_read_unlock(); 6573 rcu_read_unlock();
5586} 6574}
@@ -5588,16 +6576,16 @@ unlock:
5588void set_cpu_sd_state_idle(void) 6576void set_cpu_sd_state_idle(void)
5589{ 6577{
5590 struct sched_domain *sd; 6578 struct sched_domain *sd;
6579 int cpu = smp_processor_id();
5591 6580
5592 rcu_read_lock(); 6581 rcu_read_lock();
5593 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6582 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5594 6583
5595 if (!sd || sd->nohz_idle) 6584 if (!sd || sd->nohz_idle)
5596 goto unlock; 6585 goto unlock;
5597 sd->nohz_idle = 1; 6586 sd->nohz_idle = 1;
5598 6587
5599 for (; sd; sd = sd->parent) 6588 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5600 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5601unlock: 6589unlock:
5602 rcu_read_unlock(); 6590 rcu_read_unlock();
5603} 6591}
@@ -5661,15 +6649,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5661 /* Earliest time when we have to do rebalance again */ 6649 /* Earliest time when we have to do rebalance again */
5662 unsigned long next_balance = jiffies + 60*HZ; 6650 unsigned long next_balance = jiffies + 60*HZ;
5663 int update_next_balance = 0; 6651 int update_next_balance = 0;
5664 int need_serialize; 6652 int need_serialize, need_decay = 0;
6653 u64 max_cost = 0;
5665 6654
5666 update_blocked_averages(cpu); 6655 update_blocked_averages(cpu);
5667 6656
5668 rcu_read_lock(); 6657 rcu_read_lock();
5669 for_each_domain(cpu, sd) { 6658 for_each_domain(cpu, sd) {
6659 /*
6660 * Decay the newidle max times here because this is a regular
6661 * visit to all the domains. Decay ~1% per second.
6662 */
6663 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6664 sd->max_newidle_lb_cost =
6665 (sd->max_newidle_lb_cost * 253) / 256;
6666 sd->next_decay_max_lb_cost = jiffies + HZ;
6667 need_decay = 1;
6668 }
6669 max_cost += sd->max_newidle_lb_cost;
6670
5670 if (!(sd->flags & SD_LOAD_BALANCE)) 6671 if (!(sd->flags & SD_LOAD_BALANCE))
5671 continue; 6672 continue;
5672 6673
6674 /*
6675 * Stop the load balance at this level. There is another
6676 * CPU in our sched group which is doing load balancing more
6677 * actively.
6678 */
6679 if (!continue_balancing) {
6680 if (need_decay)
6681 continue;
6682 break;
6683 }
6684
5673 interval = sd->balance_interval; 6685 interval = sd->balance_interval;
5674 if (idle != CPU_IDLE) 6686 if (idle != CPU_IDLE)
5675 interval *= sd->busy_factor; 6687 interval *= sd->busy_factor;
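The decay factor 253/256 is about 0.988, i.e. each decay step (taken at most once per HZ worth of jiffies) trims roughly 1.2% off the recorded maximum. A quick standalone check, not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long long max_newidle_lb_cost = 1000000;	/* arbitrary, ns */
	int sec;

	for (sec = 1; sec <= 5; sec++) {
		max_newidle_lb_cost = max_newidle_lb_cost * 253 / 256;
		printf("after %d decay step(s): %llu ns\n",
		       sec, max_newidle_lb_cost);
	}
	return 0;
}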
@@ -5688,7 +6700,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5688 if (time_after_eq(jiffies, sd->last_balance + interval)) { 6700 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5689 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { 6701 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5690 /* 6702 /*
5691 * The LBF_SOME_PINNED logic could have changed 6703 * The LBF_DST_PINNED logic could have changed
5692 * env->dst_cpu, so we can't know our idle 6704 * env->dst_cpu, so we can't know our idle
5693 * state even if we migrated tasks. Update it. 6705 * state even if we migrated tasks. Update it.
5694 */ 6706 */
@@ -5703,14 +6715,14 @@ out:
5703 next_balance = sd->last_balance + interval; 6715 next_balance = sd->last_balance + interval;
5704 update_next_balance = 1; 6716 update_next_balance = 1;
5705 } 6717 }
5706 6718 }
6719 if (need_decay) {
5707 /* 6720 /*
5708 * Stop the load balance at this level. There is another 6721 * Ensure the rq-wide value also decays but keep it at a
5709 * CPU in our sched group which is doing load balancing more 6722 * reasonable floor to avoid funnies with rq->avg_idle.
5710 * actively.
5711 */ 6723 */
5712 if (!continue_balancing) 6724 rq->max_idle_balance_cost =
5713 break; 6725 max((u64)sysctl_sched_migration_cost, max_cost);
5714 } 6726 }
5715 rcu_read_unlock(); 6727 rcu_read_unlock();
5716 6728
@@ -5780,6 +6792,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5780{ 6792{
5781 unsigned long now = jiffies; 6793 unsigned long now = jiffies;
5782 struct sched_domain *sd; 6794 struct sched_domain *sd;
6795 struct sched_group_power *sgp;
6796 int nr_busy;
5783 6797
5784 if (unlikely(idle_cpu(cpu))) 6798 if (unlikely(idle_cpu(cpu)))
5785 return 0; 6799 return 0;
@@ -5805,22 +6819,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5805 goto need_kick; 6819 goto need_kick;
5806 6820
5807 rcu_read_lock(); 6821 rcu_read_lock();
5808 for_each_domain(cpu, sd) { 6822 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5809 struct sched_group *sg = sd->groups;
5810 struct sched_group_power *sgp = sg->sgp;
5811 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5812 6823
5813 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) 6824 if (sd) {
5814 goto need_kick_unlock; 6825 sgp = sd->groups->sgp;
6826 nr_busy = atomic_read(&sgp->nr_busy_cpus);
5815 6827
5816 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight 6828 if (nr_busy > 1)
5817 && (cpumask_first_and(nohz.idle_cpus_mask,
5818 sched_domain_span(sd)) < cpu))
5819 goto need_kick_unlock; 6829 goto need_kick_unlock;
5820
5821 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5822 break;
5823 } 6830 }
6831
6832 sd = rcu_dereference(per_cpu(sd_asym, cpu));
6833
6834 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
6835 sched_domain_span(sd)) < cpu))
6836 goto need_kick_unlock;
6837
5824 rcu_read_unlock(); 6838 rcu_read_unlock();
5825 return 0; 6839 return 0;
5826 6840
@@ -5928,11 +6942,15 @@ static void task_fork_fair(struct task_struct *p)
5928 cfs_rq = task_cfs_rq(current); 6942 cfs_rq = task_cfs_rq(current);
5929 curr = cfs_rq->curr; 6943 curr = cfs_rq->curr;
5930 6944
5931 if (unlikely(task_cpu(p) != this_cpu)) { 6945 /*
5932 rcu_read_lock(); 6946 * Not only the cpu but also the task_group of the parent might have
5933 __set_task_cpu(p, this_cpu); 6947 * been changed after parent->se.parent,cfs_rq were copied to
5934 rcu_read_unlock(); 6948 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
5935 } 6949 * of child point to valid ones.
6950 */
6951 rcu_read_lock();
6952 __set_task_cpu(p, this_cpu);
6953 rcu_read_unlock();
5936 6954
5937 update_curr(cfs_rq); 6955 update_curr(cfs_rq);
5938 6956
@@ -6209,7 +7227,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6209 se->cfs_rq = parent->my_q; 7227 se->cfs_rq = parent->my_q;
6210 7228
6211 se->my_q = cfs_rq; 7229 se->my_q = cfs_rq;
6212 update_load_set(&se->load, 0); 7230 /* guarantee group entities always have weight */
7231 update_load_set(&se->load, NICE_0_LOAD);
6213 se->parent = parent; 7232 se->parent = parent;
6214} 7233}
6215 7234
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8e4799..5716929a2e3a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
63/* 63/*
64 * Apply the automatic NUMA scheduling policy. Enabled automatically 64 * Apply the automatic NUMA scheduling policy. Enabled automatically
65 * at runtime if running on a NUMA machine. Can be controlled via 65 * at runtime if running on a NUMA machine. Can be controlled via
66 * numa_balancing=. Allow PTE scanning to be forced on UMA machines 66 * numa_balancing=
67 * for debugging the core machinery.
68 */ 67 */
69#ifdef CONFIG_NUMA_BALANCING 68#ifdef CONFIG_NUMA_BALANCING
70SCHED_FEAT(NUMA, false) 69SCHED_FEAT(NUMA, false)
71SCHED_FEAT(NUMA_FORCE, false) 70
71/*
72 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
73 * higher number of hinting faults are recorded during active load
74 * balancing.
75 */
76SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
77
78/*
79 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
80 * lower number of hinting faults have been recorded. As this has
81 * the potential to prevent a task ever migrating to a new node
82 * due to CPU overload it is disabled by default.
83 */
84SCHED_FEAT(NUMA_RESIST_LOWER, false)
72#endif 85#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..516c3d9ceea1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 12select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks as never migrated */ 14 return task_cpu(p); /* IDLE tasks as never migrated */
15} 15}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 01970c8e64df..7d57275fc396 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
246 * if we should look at the mask. It would be a shame 246 * if we should look at the mask. It would be a shame
247 * if we looked at the mask, but the mask was not 247 * if we looked at the mask, but the mask was not
248 * updated yet. 248 * updated yet.
249 *
250 * Matched by the barrier in pull_rt_task().
249 */ 251 */
250 wmb(); 252 smp_wmb();
251 atomic_inc(&rq->rd->rto_count); 253 atomic_inc(&rq->rd->rto_count);
252} 254}
253 255
@@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)
1169static int find_lowest_rq(struct task_struct *task); 1171static int find_lowest_rq(struct task_struct *task);
1170 1172
1171static int 1173static int
1172select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 1174select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1173{ 1175{
1174 struct task_struct *curr; 1176 struct task_struct *curr;
1175 struct rq *rq; 1177 struct rq *rq;
1176 int cpu;
1177
1178 cpu = task_cpu(p);
1179 1178
1180 if (p->nr_cpus_allowed == 1) 1179 if (p->nr_cpus_allowed == 1)
1181 goto out; 1180 goto out;
@@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1213 */ 1212 */
1214 if (curr && unlikely(rt_task(curr)) && 1213 if (curr && unlikely(rt_task(curr)) &&
1215 (curr->nr_cpus_allowed < 2 || 1214 (curr->nr_cpus_allowed < 2 ||
1216 curr->prio <= p->prio) && 1215 curr->prio <= p->prio)) {
1217 (p->nr_cpus_allowed > 1)) {
1218 int target = find_lowest_rq(p); 1216 int target = find_lowest_rq(p);
1219 1217
1220 if (target != -1) 1218 if (target != -1)
@@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)
1630 if (likely(!rt_overloaded(this_rq))) 1628 if (likely(!rt_overloaded(this_rq)))
1631 return 0; 1629 return 0;
1632 1630
1631 /*
1632 * Match the barrier from rt_set_overloaded; this guarantees that if we
1633 * see overloaded we must also see the rto_mask bit.
1634 */
1635 smp_rmb();
1636
1633 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1637 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1634 if (this_cpu == cpu) 1638 if (this_cpu == cpu)
1635 continue; 1639 continue;
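The new smp_wmb()/smp_rmb() pair is the usual publish/consume pattern: the writer makes the rto_mask bit visible before incrementing rto_count, so any reader that observes a non-zero count is guaranteed to also see the bit. A userspace analogue using C11 acquire/release in place of the kernel barriers; the names are reused purely for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic int rto_count;
static atomic_bool rto_mask_bit;

static void rt_set_overload(void)
{
	atomic_store_explicit(&rto_mask_bit, true, memory_order_relaxed);
	/* release pairs with the acquire below, like smp_wmb()/smp_rmb() */
	atomic_fetch_add_explicit(&rto_count, 1, memory_order_release);
}

static bool pull_rt_task(void)
{
	if (!atomic_load_explicit(&rto_count, memory_order_acquire))
		return false;
	/* having seen the count, the mask bit must be visible too */
	return atomic_load_explicit(&rto_mask_bit, memory_order_relaxed);
}

int main(void)
{
	rt_set_overload();
	printf("overloaded runqueue visible: %d\n", pull_rt_task());
	return 0;
}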
@@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1931 p->rt.time_slice = sched_rr_timeslice; 1935 p->rt.time_slice = sched_rr_timeslice;
1932 1936
1933 /* 1937 /*
1934 * Requeue to the end of queue if we (and all of our ancestors) are the 1938 * Requeue to the end of queue if we (and all of our ancestors) are not
1935 * only element on the queue 1939 * the only element on the queue
1936 */ 1940 */
1937 for_each_sched_rt_entity(rt_se) { 1941 for_each_sched_rt_entity(rt_se) {
1938 if (rt_se->run_list.prev != rt_se->run_list.next) { 1942 if (rt_se->run_list.prev != rt_se->run_list.next) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3c5653e1dca..88c85b21d633 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h> 8#include <linux/tick.h>
9#include <linux/slab.h>
9 10
10#include "cpupri.h" 11#include "cpupri.h"
11#include "cpuacct.h" 12#include "cpuacct.h"
@@ -408,6 +409,10 @@ struct rq {
408 * remote CPUs use both these fields when doing load calculation. 409 * remote CPUs use both these fields when doing load calculation.
409 */ 410 */
410 unsigned int nr_running; 411 unsigned int nr_running;
412#ifdef CONFIG_NUMA_BALANCING
413 unsigned int nr_numa_running;
414 unsigned int nr_preferred_running;
415#endif
411 #define CPU_LOAD_IDX_MAX 5 416 #define CPU_LOAD_IDX_MAX 5
412 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 417 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
413 unsigned long last_load_update_tick; 418 unsigned long last_load_update_tick;
@@ -476,6 +481,9 @@ struct rq {
476 u64 age_stamp; 481 u64 age_stamp;
477 u64 idle_stamp; 482 u64 idle_stamp;
478 u64 avg_idle; 483 u64 avg_idle;
484
485 /* This is used to determine avg_idle's max value */
486 u64 max_idle_balance_cost;
479#endif 487#endif
480 488
481#ifdef CONFIG_IRQ_TIME_ACCOUNTING 489#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
552 return rq->clock_task; 560 return rq->clock_task;
553} 561}
554 562
563#ifdef CONFIG_NUMA_BALANCING
564extern void sched_setnuma(struct task_struct *p, int node);
565extern int migrate_task_to(struct task_struct *p, int cpu);
566extern int migrate_swap(struct task_struct *, struct task_struct *);
567#endif /* CONFIG_NUMA_BALANCING */
568
555#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
556 570
557#define rcu_dereference_check_sched_domain(p) \ 571#define rcu_dereference_check_sched_domain(p) \
@@ -593,9 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
593 return hsd; 607 return hsd;
594} 608}
595 609
610static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
611{
612 struct sched_domain *sd;
613
614 for_each_domain(cpu, sd) {
615 if (sd->flags & flag)
616 break;
617 }
618
619 return sd;
620}
621
596DECLARE_PER_CPU(struct sched_domain *, sd_llc); 622DECLARE_PER_CPU(struct sched_domain *, sd_llc);
597DECLARE_PER_CPU(int, sd_llc_size); 623DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 624DECLARE_PER_CPU(int, sd_llc_id);
625DECLARE_PER_CPU(struct sched_domain *, sd_numa);
626DECLARE_PER_CPU(struct sched_domain *, sd_busy);
627DECLARE_PER_CPU(struct sched_domain *, sd_asym);
599 628
600struct sched_group_power { 629struct sched_group_power {
601 atomic_t ref; 630 atomic_t ref;
@@ -605,6 +634,7 @@ struct sched_group_power {
605 */ 634 */
606 unsigned int power, power_orig; 635 unsigned int power, power_orig;
607 unsigned long next_update; 636 unsigned long next_update;
637 int imbalance; /* XXX unrelated to power but shared group state */
608 /* 638 /*
609 * Number of busy cpus in this group. 639 * Number of busy cpus in this group.
610 */ 640 */
@@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
719 */ 749 */
720 smp_wmb(); 750 smp_wmb();
721 task_thread_info(p)->cpu = cpu; 751 task_thread_info(p)->cpu = cpu;
752 p->wake_cpu = cpu;
722#endif 753#endif
723} 754}
724 755
@@ -974,7 +1005,7 @@ struct sched_class {
974 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1005 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
975 1006
976#ifdef CONFIG_SMP 1007#ifdef CONFIG_SMP
977 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1008 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
978 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1009 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
979 1010
980 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1011 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1220 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1251 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1221} 1252}
1222 1253
1254static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1255{
1256 if (l1 > l2)
1257 swap(l1, l2);
1258
1259 spin_lock(l1);
1260 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1261}
1262
1263static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1264{
1265 if (l1 > l2)
1266 swap(l1, l2);
1267
1268 raw_spin_lock(l1);
1269 raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1270}
1271
1223/* 1272/*
1224 * double_rq_lock - safely lock two runqueues 1273 * double_rq_lock - safely lock two runqueues
1225 * 1274 *
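double_lock() and double_raw_lock() dodge ABBA deadlock by always taking the lower-addressed lock first, so two callers locking the same pair in opposite argument order still agree on the acquisition order. A minimal userspace analogue with pthread mutexes standing in for spinlocks:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

static void double_lock(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	if (l1 > l2) {			/* order by address, as the helper above does */
		pthread_mutex_t *tmp = l1;

		l1 = l2;
		l2 = tmp;
	}
	pthread_mutex_lock(l1);
	pthread_mutex_lock(l2);
}

int main(void)
{
	double_lock(&a, &b);		/* acquires in the same order as double_lock(&b, &a) */
	puts("both locks held in a stable order");
	pthread_mutex_unlock(&b);
	pthread_mutex_unlock(&a);
	return 0;
}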
@@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1305extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1354extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1306extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1307 1356
1308extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1357extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void);
1309 1359
1310#ifdef CONFIG_NO_HZ_COMMON 1360#ifdef CONFIG_NO_HZ_COMMON
1311enum rq_nohz_flag_bits { 1361enum rq_nohz_flag_bits {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 5aef494fc8b4..4ab704339656 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The 59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew. 60 * delta taken on each cpu would annul the skew.
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
63{ 63{
64 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 64 unsigned long long now = rq_clock(rq), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
69 sched_info_reset_dequeued(t); 69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta; 70 t->sched_info.run_delay += delta;
71 71
72 rq_sched_info_dequeued(task_rq(t), delta); 72 rq_sched_info_dequeued(rq, delta);
73} 73}
74 74
75/* 75/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
77 * long it was waiting to run. We also note when it began so that we 77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is. 78 * can keep stats on how long its timeslice is.
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct rq *rq, struct task_struct *t)
81{ 81{
82 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 82 unsigned long long now = rq_clock(rq), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
88 t->sched_info.last_arrival = now; 88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++; 89 t->sched_info.pcount++;
90 90
91 rq_sched_info_arrive(task_rq(t), delta); 91 rq_sched_info_arrive(rq, delta);
92} 92}
93 93
94/* 94/*
@@ -96,29 +96,30 @@ static void sched_info_arrive(struct task_struct *t)
96 * the timestamp if it is already not set. It's assumed that 96 * the timestamp if it is already not set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate. 97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */ 98 */
99static inline void sched_info_queued(struct task_struct *t) 99static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = rq_clock(task_rq(t)); 103 t->sched_info.last_queued = rq_clock(rq);
104} 104}
105 105
106/* 106/*
107 * Called when a process ceases being the active-running process, either 107 * Called when a process ceases being the active-running process involuntarily
108 * voluntarily or involuntarily. Now we can calculate how long we ran. 108 * due, typically, to expiring its time slice (this may also be called when
109 * switching to the idle task). Now we can calculate how long we ran.
109 * Also, if the process is still in the TASK_RUNNING state, call 110 * Also, if the process is still in the TASK_RUNNING state, call
110 * sched_info_queued() to mark that it has now again started waiting on 111 * sched_info_queued() to mark that it has now again started waiting on
111 * the runqueue. 112 * the runqueue.
112 */ 113 */
113static inline void sched_info_depart(struct task_struct *t) 114static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
114{ 115{
115 unsigned long long delta = rq_clock(task_rq(t)) - 116 unsigned long long delta = rq_clock(rq) -
116 t->sched_info.last_arrival; 117 t->sched_info.last_arrival;
117 118
118 rq_sched_info_depart(task_rq(t), delta); 119 rq_sched_info_depart(rq, delta);
119 120
120 if (t->state == TASK_RUNNING) 121 if (t->state == TASK_RUNNING)
121 sched_info_queued(t); 122 sched_info_queued(rq, t);
122} 123}
123 124
124/* 125/*
@@ -127,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
127 * the idle task.) We are only called when prev != next. 128 * the idle task.) We are only called when prev != next.
128 */ 129 */
129static inline void 130static inline void
130__sched_info_switch(struct task_struct *prev, struct task_struct *next) 131__sched_info_switch(struct rq *rq,
132 struct task_struct *prev, struct task_struct *next)
131{ 133{
132 struct rq *rq = task_rq(prev);
133
134 /* 134 /*
135 * prev now departs the cpu. It's not interesting to record 135 * prev now departs the cpu. It's not interesting to record
136 * stats about how efficient we were at scheduling the idle 136 * stats about how efficient we were at scheduling the idle
137 * process, however. 137 * process, however.
138 */ 138 */
139 if (prev != rq->idle) 139 if (prev != rq->idle)
140 sched_info_depart(prev); 140 sched_info_depart(rq, prev);
141 141
142 if (next != rq->idle) 142 if (next != rq->idle)
143 sched_info_arrive(next); 143 sched_info_arrive(rq, next);
144} 144}
145static inline void 145static inline void
146sched_info_switch(struct task_struct *prev, struct task_struct *next) 146sched_info_switch(struct rq *rq,
147 struct task_struct *prev, struct task_struct *next)
147{ 148{
148 if (unlikely(sched_info_on())) 149 if (unlikely(sched_info_on()))
149 __sched_info_switch(prev, next); 150 __sched_info_switch(rq, prev, next);
150} 151}
151#else 152#else
152#define sched_info_queued(t) do { } while (0) 153#define sched_info_queued(rq, t) do { } while (0)
153#define sched_info_reset_dequeued(t) do { } while (0) 154#define sched_info_reset_dequeued(t) do { } while (0)
154#define sched_info_dequeued(t) do { } while (0) 155#define sched_info_dequeued(rq, t) do { } while (0)
155#define sched_info_switch(t, next) do { } while (0) 156#define sched_info_depart(rq, t) do { } while (0)
157#define sched_info_arrive(rq, next) do { } while (0)
158#define sched_info_switch(rq, t, next) do { } while (0)
156#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 159#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
157 160
158/* 161/*
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index e08fbeeb54b9..47197de8abd9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
11 11
12#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
13static int 13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) 14select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
15{ 15{
16 return task_cpu(p); /* stop tasks as never migrate */ 16 return task_cpu(p); /* stop tasks as never migrate */
17} 17}
diff --git a/kernel/wait.c b/kernel/sched/wait.c
index d550920e040c..7d50f794e248 100644
--- a/kernel/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue);
53 53
54 54
55/* 55/*
56 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
57 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
58 * number) then we wake all the non-exclusive tasks and one exclusive task.
59 *
60 * There are circumstances in which we can try to wake a task which has already
61 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
62 * zero in this (rare) case, and we handle it by continuing to scan the queue.
63 */
64static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
65 int nr_exclusive, int wake_flags, void *key)
66{
67 wait_queue_t *curr, *next;
68
69 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
70 unsigned flags = curr->flags;
71
72 if (curr->func(curr, mode, wake_flags, key) &&
73 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
74 break;
75 }
76}
77
78/**
79 * __wake_up - wake up threads blocked on a waitqueue.
80 * @q: the waitqueue
81 * @mode: which threads
82 * @nr_exclusive: how many wake-one or wake-many threads to wake up
83 * @key: is directly passed to the wakeup function
84 *
85 * It may be assumed that this function implies a write memory barrier before
86 * changing the task state if and only if any tasks are woken up.
87 */
88void __wake_up(wait_queue_head_t *q, unsigned int mode,
89 int nr_exclusive, void *key)
90{
91 unsigned long flags;
92
93 spin_lock_irqsave(&q->lock, flags);
94 __wake_up_common(q, mode, nr_exclusive, 0, key);
95 spin_unlock_irqrestore(&q->lock, flags);
96}
97EXPORT_SYMBOL(__wake_up);
98
99/*
100 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
101 */
102void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
103{
104 __wake_up_common(q, mode, nr, 0, NULL);
105}
106EXPORT_SYMBOL_GPL(__wake_up_locked);
107
108void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
109{
110 __wake_up_common(q, mode, 1, 0, key);
111}
112EXPORT_SYMBOL_GPL(__wake_up_locked_key);
113
114/**
115 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
116 * @q: the waitqueue
117 * @mode: which threads
118 * @nr_exclusive: how many wake-one or wake-many threads to wake up
119 * @key: opaque value to be passed to wakeup targets
120 *
 121 * The sync wakeup differs in that the waker knows that it will schedule
122 * away soon, so while the target thread will be woken up, it will not
123 * be migrated to another CPU - ie. the two threads are 'synchronized'
124 * with each other. This can prevent needless bouncing between CPUs.
125 *
126 * On UP it can prevent extra preemption.
127 *
128 * It may be assumed that this function implies a write memory barrier before
129 * changing the task state if and only if any tasks are woken up.
130 */
131void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
132 int nr_exclusive, void *key)
133{
134 unsigned long flags;
135 int wake_flags = 1; /* XXX WF_SYNC */
136
137 if (unlikely(!q))
138 return;
139
140 if (unlikely(nr_exclusive != 1))
141 wake_flags = 0;
142
143 spin_lock_irqsave(&q->lock, flags);
144 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
145 spin_unlock_irqrestore(&q->lock, flags);
146}
147EXPORT_SYMBOL_GPL(__wake_up_sync_key);
148
149/*
150 * __wake_up_sync - see __wake_up_sync_key()
151 */
152void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
153{
154 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
155}
156EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
157
158/*
56 * Note: we use "set_current_state()" _after_ the wait-queue add, 159 * Note: we use "set_current_state()" _after_ the wait-queue add,
57 * because we need a memory barrier there on SMP, so that any 160 * because we need a memory barrier there on SMP, so that any
58 * wake-function that tests for the wait-queue being active 161 * wake-function that tests for the wait-queue being active
@@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 195}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 196EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 197
198long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
199{
200 unsigned long flags;
201
202 if (signal_pending_state(state, current))
203 return -ERESTARTSYS;
204
205 wait->private = current;
206 wait->func = autoremove_wake_function;
207
208 spin_lock_irqsave(&q->lock, flags);
209 if (list_empty(&wait->task_list)) {
210 if (wait->flags & WQ_FLAG_EXCLUSIVE)
211 __add_wait_queue_tail(q, wait);
212 else
213 __add_wait_queue(q, wait);
214 }
215 set_current_state(state);
216 spin_unlock_irqrestore(&q->lock, flags);
217
218 return 0;
219}
220EXPORT_SYMBOL(prepare_to_wait_event);
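For context, a hedged sketch of the loop a wait_event()-style caller would build around this helper; kernel-style pseudo-usage only, with invented names (wait_for_flag, flag) and simplified error handling:

static long wait_for_flag(wait_queue_head_t *wq, int *flag)
{
	DEFINE_WAIT(wait);
	long ret = 0;

	for (;;) {
		ret = prepare_to_wait_event(wq, &wait, TASK_INTERRUPTIBLE);
		if (*flag)		/* re-check after queueing to avoid a missed wakeup */
			break;
		if (ret)		/* -ERESTARTSYS: a signal is pending */
			break;
		schedule();		/* sleep until __wake_up() is called on wq */
	}
	finish_wait(wq, &wait);
	return *flag ? 0 : ret;
}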
221
95/** 222/**
96 * finish_wait - clean up after waiting in a queue 223 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 224 * @q: waitqueue waited on
diff --git a/kernel/signal.c b/kernel/signal.c
index 50e41075ac77..940b30ee9a30 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
2723 2723
2724#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER 2724#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2725 2725
2726int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) 2726int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2727{ 2727{
2728 int err; 2728 int err;
2729 2729
@@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3394 new_ka.sa.sa_restorer = compat_ptr(restorer); 3394 new_ka.sa.sa_restorer = compat_ptr(restorer);
3395#endif 3395#endif
3396 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); 3396 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
3397 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); 3397 ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
3398 if (ret) 3398 if (ret)
3399 return -EFAULT; 3399 return -EFAULT;
3400 sigset_from_compat(&new_ka.sa.sa_mask, &mask); 3400 sigset_from_compat(&new_ka.sa.sa_mask, &mask);
@@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3406 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), 3406 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
3407 &oact->sa_handler); 3407 &oact->sa_handler);
3408 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); 3408 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
3409 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); 3409 ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
3410#ifdef __ARCH_HAS_SA_RESTORER 3410#ifdef __ARCH_HAS_SA_RESTORER
3411 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), 3411 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3412 &oact->sa_restorer); 3412 &oact->sa_restorer);
diff --git a/kernel/smp.c b/kernel/smp.c
index 449b707fc20d..bd9f94028838 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -15,9 +15,9 @@
15 15
16#include "smpboot.h" 16#include "smpboot.h"
17 17
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19enum { 18enum {
20 CSD_FLAG_LOCK = 0x01, 19 CSD_FLAG_LOCK = 0x01,
20 CSD_FLAG_WAIT = 0x02,
21}; 21};
22 22
23struct call_function_data { 23struct call_function_data {
@@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
48 cpu_to_node(cpu))) 48 cpu_to_node(cpu)))
49 return notifier_from_errno(-ENOMEM); 49 return notifier_from_errno(-ENOMEM);
50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, 50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
51 cpu_to_node(cpu))) 51 cpu_to_node(cpu))) {
52 free_cpumask_var(cfd->cpumask);
52 return notifier_from_errno(-ENOMEM); 53 return notifier_from_errno(-ENOMEM);
54 }
53 cfd->csd = alloc_percpu(struct call_single_data); 55 cfd->csd = alloc_percpu(struct call_single_data);
54 if (!cfd->csd) { 56 if (!cfd->csd) {
57 free_cpumask_var(cfd->cpumask_ipi);
55 free_cpumask_var(cfd->cpumask); 58 free_cpumask_var(cfd->cpumask);
56 return notifier_from_errno(-ENOMEM); 59 return notifier_from_errno(-ENOMEM);
57 } 60 }
@@ -121,7 +124,7 @@ static void csd_lock(struct call_single_data *csd)
121 124
122static void csd_unlock(struct call_single_data *csd) 125static void csd_unlock(struct call_single_data *csd)
123{ 126{
124 WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); 127 WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK));
125 128
126 /* 129 /*
127 * ensure we're all done before releasing data: 130 * ensure we're all done before releasing data:
@@ -136,13 +139,15 @@ static void csd_unlock(struct call_single_data *csd)
136 * for execution on the given CPU. data must already have 139 * for execution on the given CPU. data must already have
137 * ->func, ->info, and ->flags set. 140 * ->func, ->info, and ->flags set.
138 */ 141 */
139static 142static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
140void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
141{ 143{
142 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 144 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
143 unsigned long flags; 145 unsigned long flags;
144 int ipi; 146 int ipi;
145 147
148 if (wait)
149 csd->flags |= CSD_FLAG_WAIT;
150
146 raw_spin_lock_irqsave(&dst->lock, flags); 151 raw_spin_lock_irqsave(&dst->lock, flags);
147 ipi = list_empty(&dst->list); 152 ipi = list_empty(&dst->list);
148 list_add_tail(&csd->list, &dst->list); 153 list_add_tail(&csd->list, &dst->list);
@@ -337,6 +342,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd,
337 } 342 }
338 put_cpu(); 343 put_cpu();
339} 344}
345EXPORT_SYMBOL_GPL(__smp_call_function_single);
340 346
341/** 347/**
342 * smp_call_function_many(): Run a function on a set of other CPUs. 348 * smp_call_function_many(): Run a function on a set of other CPUs.
@@ -456,7 +462,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
456 return 0; 462 return 0;
457} 463}
458EXPORT_SYMBOL(smp_call_function); 464EXPORT_SYMBOL(smp_call_function);
459#endif /* USE_GENERIC_SMP_HELPERS */
460 465
461/* Setup configured maximum number of CPUs to activate */ 466/* Setup configured maximum number of CPUs to activate */
462unsigned int setup_max_cpus = NR_CPUS; 467unsigned int setup_max_cpus = NR_CPUS;
@@ -521,6 +526,11 @@ void __init setup_nr_cpu_ids(void)
521 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; 526 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
522} 527}
523 528
529void __weak smp_announce(void)
530{
531 printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
532}
533
524/* Called by boot processor to activate the rest. */ 534/* Called by boot processor to activate the rest. */
525void __init smp_init(void) 535void __init smp_init(void)
526{ 536{
@@ -537,7 +547,7 @@ void __init smp_init(void)
537 } 547 }
538 548
539 /* Any cleanup work */ 549 /* Any cleanup work */
540 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); 550 smp_announce();
541 smp_cpus_done(setup_max_cpus); 551 smp_cpus_done(setup_max_cpus);
542} 552}
543 553
@@ -572,8 +582,10 @@ EXPORT_SYMBOL(on_each_cpu);
572 * 582 *
573 * If @wait is true, then returns once @func has returned. 583 * If @wait is true, then returns once @func has returned.
574 * 584 *
575 * You must not call this function with disabled interrupts or 585 * You must not call this function with disabled interrupts or from a
576 * from a hardware interrupt handler or from a bottom half handler. 586 * hardware interrupt handler or from a bottom half handler. The
587 * exception is that it may be used during early boot while
588 * early_boot_irqs_disabled is set.
577 */ 589 */
578void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, 590void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
579 void *info, bool wait) 591 void *info, bool wait)
@@ -582,9 +594,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
582 594
583 smp_call_function_many(mask, func, info, wait); 595 smp_call_function_many(mask, func, info, wait);
584 if (cpumask_test_cpu(cpu, mask)) { 596 if (cpumask_test_cpu(cpu, mask)) {
585 local_irq_disable(); 597 unsigned long flags;
598 local_irq_save(flags);
586 func(info); 599 func(info);
587 local_irq_enable(); 600 local_irq_restore(flags);
588 } 601 }
589 put_cpu(); 602 put_cpu();
590} 603}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index be3d3514c325..11025ccc06dd 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,8 +6,6 @@
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 *
10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 9 */
12 10
13#include <linux/export.h> 11#include <linux/export.h>
@@ -29,7 +27,6 @@
29#define CREATE_TRACE_POINTS 27#define CREATE_TRACE_POINTS
30#include <trace/events/irq.h> 28#include <trace/events/irq.h>
31 29
32#include <asm/irq.h>
33/* 30/*
34 - No shared variables, all the data are CPU local. 31 - No shared variables, all the data are CPU local.
35 - If a softirq needs serialization, let it serialize itself 32 - If a softirq needs serialization, let it serialize itself
@@ -100,13 +97,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
100 97
101 raw_local_irq_save(flags); 98 raw_local_irq_save(flags);
102 /* 99 /*
103 * The preempt tracer hooks into add_preempt_count and will break 100 * The preempt tracer hooks into preempt_count_add and will break
104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 101 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
105 * is set and before current->softirq_enabled is cleared. 102 * is set and before current->softirq_enabled is cleared.
106 * We must manually increment preempt_count here and manually 103 * We must manually increment preempt_count here and manually
107 * call the trace_preempt_off later. 104 * call the trace_preempt_off later.
108 */ 105 */
109 preempt_count() += cnt; 106 __preempt_count_add(cnt);
110 /* 107 /*
111 * Were softirqs turned off above: 108 * Were softirqs turned off above:
112 */ 109 */
@@ -120,7 +117,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
120#else /* !CONFIG_TRACE_IRQFLAGS */ 117#else /* !CONFIG_TRACE_IRQFLAGS */
121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 118static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
122{ 119{
123 add_preempt_count(cnt); 120 preempt_count_add(cnt);
124 barrier(); 121 barrier();
125} 122}
126#endif /* CONFIG_TRACE_IRQFLAGS */ 123#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -134,12 +131,11 @@ EXPORT_SYMBOL(local_bh_disable);
134 131
135static void __local_bh_enable(unsigned int cnt) 132static void __local_bh_enable(unsigned int cnt)
136{ 133{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled()); 134 WARN_ON_ONCE(!irqs_disabled());
139 135
140 if (softirq_count() == cnt) 136 if (softirq_count() == cnt)
141 trace_softirqs_on(_RET_IP_); 137 trace_softirqs_on(_RET_IP_);
142 sub_preempt_count(cnt); 138 preempt_count_sub(cnt);
143} 139}
144 140
145/* 141/*
@@ -149,6 +145,7 @@ static void __local_bh_enable(unsigned int cnt)
149 */ 145 */
150void _local_bh_enable(void) 146void _local_bh_enable(void)
151{ 147{
148 WARN_ON_ONCE(in_irq());
152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 149 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
153} 150}
154 151
@@ -169,12 +166,17 @@ static inline void _local_bh_enable_ip(unsigned long ip)
169 * Keep preemption disabled until we are done with 166 * Keep preemption disabled until we are done with
170 * softirq processing: 167 * softirq processing:
171 */ 168 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 169 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
173 170
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 171 if (unlikely(!in_interrupt() && local_softirq_pending())) {
172 /*
173 * Run softirq if any pending. And do it in its own stack
174 * as we may be calling this deep in a task call stack already.
175 */
175 do_softirq(); 176 do_softirq();
177 }
176 178
177 dec_preempt_count(); 179 preempt_count_dec();
178#ifdef CONFIG_TRACE_IRQFLAGS 180#ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 181 local_irq_enable();
180#endif 182#endif
@@ -256,7 +258,7 @@ restart:
256 " exited with %08x?\n", vec_nr, 258 " exited with %08x?\n", vec_nr,
257 softirq_to_name[vec_nr], h->action, 259 softirq_to_name[vec_nr], h->action,
258 prev_count, preempt_count()); 260 prev_count, preempt_count());
259 preempt_count() = prev_count; 261 preempt_count_set(prev_count);
260 } 262 }
261 263
262 rcu_bh_qs(cpu); 264 rcu_bh_qs(cpu);
@@ -280,10 +282,11 @@ restart:
280 282
281 account_irq_exit_time(current); 283 account_irq_exit_time(current);
282 __local_bh_enable(SOFTIRQ_OFFSET); 284 __local_bh_enable(SOFTIRQ_OFFSET);
285 WARN_ON_ONCE(in_interrupt());
283 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 286 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
284} 287}
285 288
286#ifndef __ARCH_HAS_DO_SOFTIRQ 289
287 290
288asmlinkage void do_softirq(void) 291asmlinkage void do_softirq(void)
289{ 292{
@@ -298,13 +301,11 @@ asmlinkage void do_softirq(void)
298 pending = local_softirq_pending(); 301 pending = local_softirq_pending();
299 302
300 if (pending) 303 if (pending)
301 __do_softirq(); 304 do_softirq_own_stack();
302 305
303 local_irq_restore(flags); 306 local_irq_restore(flags);
304} 307}
305 308
306#endif
307
308/* 309/*
309 * Enter an interrupt context. 310 * Enter an interrupt context.
310 */ 311 */
@@ -328,10 +329,25 @@ void irq_enter(void)
328 329
329static inline void invoke_softirq(void) 330static inline void invoke_softirq(void)
330{ 331{
331 if (!force_irqthreads) 332 if (!force_irqthreads) {
333#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
334 /*
335 * We can safely execute softirq on the current stack if
336 * it is the irq stack, because it should be near empty
337 * at this stage.
338 */
332 __do_softirq(); 339 __do_softirq();
333 else 340#else
341 /*
342 * Otherwise, irq_exit() is called on the task stack that can
343 * be potentially deep already. So call softirq in its own stack
 344 * to prevent any overrun.
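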
345 */
346 do_softirq_own_stack();
347#endif
348 } else {
334 wakeup_softirqd(); 349 wakeup_softirqd();
350 }
335} 351}
336 352
337static inline void tick_irq_exit(void) 353static inline void tick_irq_exit(void)
@@ -360,7 +376,7 @@ void irq_exit(void)
360 376
361 account_irq_exit_time(current); 377 account_irq_exit_time(current);
362 trace_hardirq_exit(); 378 trace_hardirq_exit();
363 sub_preempt_count(HARDIRQ_OFFSET); 379 preempt_count_sub(HARDIRQ_OFFSET);
364 if (!in_interrupt() && local_softirq_pending()) 380 if (!in_interrupt() && local_softirq_pending())
365 invoke_softirq(); 381 invoke_softirq();
366 382
@@ -609,146 +625,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
609} 625}
610EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); 626EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
611 627
612/*
613 * Remote softirq bits
614 */
615
616DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
617EXPORT_PER_CPU_SYMBOL(softirq_work_list);
618
619static void __local_trigger(struct call_single_data *cp, int softirq)
620{
621 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
622
623 list_add_tail(&cp->list, head);
624
625 /* Trigger the softirq only if the list was previously empty. */
626 if (head->next == &cp->list)
627 raise_softirq_irqoff(softirq);
628}
629
630#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
631static void remote_softirq_receive(void *data)
632{
633 struct call_single_data *cp = data;
634 unsigned long flags;
635 int softirq;
636
637 softirq = *(int *)cp->info;
638 local_irq_save(flags);
639 __local_trigger(cp, softirq);
640 local_irq_restore(flags);
641}
642
643static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
644{
645 if (cpu_online(cpu)) {
646 cp->func = remote_softirq_receive;
647 cp->info = &softirq;
648 cp->flags = 0;
649
650 __smp_call_function_single(cpu, cp, 0);
651 return 0;
652 }
653 return 1;
654}
655#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
656static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
657{
658 return 1;
659}
660#endif
661
662/**
663 * __send_remote_softirq - try to schedule softirq work on a remote cpu
664 * @cp: private SMP call function data area
665 * @cpu: the remote cpu
666 * @this_cpu: the currently executing cpu
667 * @softirq: the softirq for the work
668 *
669 * Attempt to schedule softirq work on a remote cpu. If this cannot be
670 * done, the work is instead queued up on the local cpu.
671 *
672 * Interrupts must be disabled.
673 */
674void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
675{
676 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
677 __local_trigger(cp, softirq);
678}
679EXPORT_SYMBOL(__send_remote_softirq);
680
681/**
682 * send_remote_softirq - try to schedule softirq work on a remote cpu
683 * @cp: private SMP call function data area
684 * @cpu: the remote cpu
685 * @softirq: the softirq for the work
686 *
687 * Like __send_remote_softirq except that disabling interrupts and
688 * computing the current cpu is done for the caller.
689 */
690void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
691{
692 unsigned long flags;
693 int this_cpu;
694
695 local_irq_save(flags);
696 this_cpu = smp_processor_id();
697 __send_remote_softirq(cp, cpu, this_cpu, softirq);
698 local_irq_restore(flags);
699}
700EXPORT_SYMBOL(send_remote_softirq);
701
702static int remote_softirq_cpu_notify(struct notifier_block *self,
703 unsigned long action, void *hcpu)
704{
705 /*
706 * If a CPU goes away, splice its entries to the current CPU
707 * and trigger a run of the softirq
708 */
709 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
710 int cpu = (unsigned long) hcpu;
711 int i;
712
713 local_irq_disable();
714 for (i = 0; i < NR_SOFTIRQS; i++) {
715 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
716 struct list_head *local_head;
717
718 if (list_empty(head))
719 continue;
720
721 local_head = &__get_cpu_var(softirq_work_list[i]);
722 list_splice_init(head, local_head);
723 raise_softirq_irqoff(i);
724 }
725 local_irq_enable();
726 }
727
728 return NOTIFY_OK;
729}
730
731static struct notifier_block remote_softirq_cpu_notifier = {
732 .notifier_call = remote_softirq_cpu_notify,
733};
734
735void __init softirq_init(void) 628void __init softirq_init(void)
736{ 629{
737 int cpu; 630 int cpu;
738 631
739 for_each_possible_cpu(cpu) { 632 for_each_possible_cpu(cpu) {
740 int i;
741
742 per_cpu(tasklet_vec, cpu).tail = 633 per_cpu(tasklet_vec, cpu).tail =
743 &per_cpu(tasklet_vec, cpu).head; 634 &per_cpu(tasklet_vec, cpu).head;
744 per_cpu(tasklet_hi_vec, cpu).tail = 635 per_cpu(tasklet_hi_vec, cpu).tail =
745 &per_cpu(tasklet_hi_vec, cpu).head; 636 &per_cpu(tasklet_hi_vec, cpu).head;
746 for (i = 0; i < NR_SOFTIRQS; i++)
747 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
748 } 637 }
749 638
750 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
751
752 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 639 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
753 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 640 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
754} 641}
@@ -762,6 +649,10 @@ static void run_ksoftirqd(unsigned int cpu)
762{ 649{
763 local_irq_disable(); 650 local_irq_disable();
764 if (local_softirq_pending()) { 651 if (local_softirq_pending()) {
652 /*
653 * We can safely run softirq on inline stack, as we are not deep
654 * in the task stack here.
655 */
765 __do_softirq(); 656 __do_softirq();
766 rcu_note_context_switch(cpu); 657 rcu_note_context_switch(cpu);
767 local_irq_enable(); 658 local_irq_enable();
@@ -876,7 +767,6 @@ int __init __weak early_irq_init(void)
876 return 0; 767 return 0;
877} 768}
878 769
879#ifdef CONFIG_GENERIC_HARDIRQS
880int __init __weak arch_probe_nr_irqs(void) 770int __init __weak arch_probe_nr_irqs(void)
881{ 771{
882 return NR_IRQS_LEGACY; 772 return NR_IRQS_LEGACY;
@@ -886,4 +776,3 @@ int __init __weak arch_early_irq_init(void)
886{ 776{
887 return 0; 777 return 0;
888} 778}
889#endif
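
Most of the softirq.c churn above is the rename from add_preempt_count()/sub_preempt_count() to preempt_count_add()/preempt_count_sub(), plus the new do_softirq_own_stack() hook that lets softirqs run on a dedicated stack instead of a possibly deep task stack. The bottom-half nesting itself is still plain arithmetic on preempt_count. A standalone model of that bookkeeping, with the customary bit values (treat the constants as illustrative rather than authoritative):

#include <stdio.h>

#define SOFTIRQ_OFFSET          0x00000100U
#define SOFTIRQ_MASK            0x0000ff00U
#define SOFTIRQ_DISABLE_OFFSET  (2 * SOFTIRQ_OFFSET)

static unsigned int preempt_count;

static unsigned int softirq_count(void)
{
        return preempt_count & SOFTIRQ_MASK;
}

int main(void)
{
        preempt_count += SOFTIRQ_DISABLE_OFFSET;        /* local_bh_disable()  */
        printf("bh disabled: softirq_count = %#x\n", softirq_count());

        preempt_count += SOFTIRQ_OFFSET;                /* serving a softirq   */
        printf("in softirq:  softirq_count = %#x\n", softirq_count());

        preempt_count -= SOFTIRQ_OFFSET;
        preempt_count -= SOFTIRQ_DISABLE_OFFSET;        /* local_bh_enable()   */
        printf("enabled:     softirq_count = %#x\n", softirq_count());
        return 0;
}

in_softirq() is simply softirq_count() != 0 on the real counter, which is why getting the add/sub pairs right matters for the warnings the patch moves around.
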
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index c09f2955ae30..84571e09c907 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,6 +20,7 @@
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21#include <linux/smpboot.h> 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/lglock.h>
23 24
24/* 25/*
25 * Structure to determine completion condition and record errors. May 26 * Structure to determine completion condition and record errors. May
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); 44static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 45static bool stop_machine_initialized = false;
45 46
47/*
48 * Avoids a race between stop_two_cpus and global stop_cpus, where
49 * the stoppers could get queued up in reverse order, leading to
50 * system deadlock. Using an lglock means stop_two_cpus remains
51 * relatively cheap.
52 */
53DEFINE_STATIC_LGLOCK(stop_cpus_lock);
54
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 55static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
47{ 56{
48 memset(done, 0, sizeof(*done)); 57 memset(done, 0, sizeof(*done));
@@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
115 return done.executed ? done.ret : -ENOENT; 124 return done.executed ? done.ret : -ENOENT;
116} 125}
117 126
127/* This controls the threads on each CPU. */
128enum multi_stop_state {
129 /* Dummy starting state for thread. */
130 MULTI_STOP_NONE,
131 /* Awaiting everyone to be scheduled. */
132 MULTI_STOP_PREPARE,
133 /* Disable interrupts. */
134 MULTI_STOP_DISABLE_IRQ,
135 /* Run the function */
136 MULTI_STOP_RUN,
137 /* Exit */
138 MULTI_STOP_EXIT,
139};
140
141struct multi_stop_data {
142 int (*fn)(void *);
143 void *data;
144 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
145 unsigned int num_threads;
146 const struct cpumask *active_cpus;
147
148 enum multi_stop_state state;
149 atomic_t thread_ack;
150};
151
152static void set_state(struct multi_stop_data *msdata,
153 enum multi_stop_state newstate)
154{
155 /* Reset ack counter. */
156 atomic_set(&msdata->thread_ack, msdata->num_threads);
157 smp_wmb();
158 msdata->state = newstate;
159}
160
161/* Last one to ack a state moves to the next state. */
162static void ack_state(struct multi_stop_data *msdata)
163{
164 if (atomic_dec_and_test(&msdata->thread_ack))
165 set_state(msdata, msdata->state + 1);
166}
167
168/* This is the cpu_stop function which stops the CPU. */
169static int multi_cpu_stop(void *data)
170{
171 struct multi_stop_data *msdata = data;
172 enum multi_stop_state curstate = MULTI_STOP_NONE;
173 int cpu = smp_processor_id(), err = 0;
174 unsigned long flags;
175 bool is_active;
176
177 /*
178 * When called from stop_machine_from_inactive_cpu(), irq might
179 * already be disabled. Save the state and restore it on exit.
180 */
181 local_save_flags(flags);
182
183 if (!msdata->active_cpus)
184 is_active = cpu == cpumask_first(cpu_online_mask);
185 else
186 is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
187
188 /* Simple state machine */
189 do {
190 /* Chill out and ensure we re-read multi_stop_state. */
191 cpu_relax();
192 if (msdata->state != curstate) {
193 curstate = msdata->state;
194 switch (curstate) {
195 case MULTI_STOP_DISABLE_IRQ:
196 local_irq_disable();
197 hard_irq_disable();
198 break;
199 case MULTI_STOP_RUN:
200 if (is_active)
201 err = msdata->fn(msdata->data);
202 break;
203 default:
204 break;
205 }
206 ack_state(msdata);
207 }
208 } while (curstate != MULTI_STOP_EXIT);
209
210 local_irq_restore(flags);
211 return err;
212}
213
214struct irq_cpu_stop_queue_work_info {
215 int cpu1;
216 int cpu2;
217 struct cpu_stop_work *work1;
218 struct cpu_stop_work *work2;
219};
220
221/*
222 * This function is always run with irqs and preemption disabled.
223 * This guarantees that both work1 and work2 get queued, before
224 * our local migrate thread gets the chance to preempt us.
225 */
226static void irq_cpu_stop_queue_work(void *arg)
227{
228 struct irq_cpu_stop_queue_work_info *info = arg;
229 cpu_stop_queue_work(info->cpu1, info->work1);
230 cpu_stop_queue_work(info->cpu2, info->work2);
231}
232
233/**
234 * stop_two_cpus - stops two cpus
235 * @cpu1: the cpu to stop
236 * @cpu2: the other cpu to stop
237 * @fn: function to execute
238 * @arg: argument to @fn
239 *
240 * Stops both the current and specified CPU and runs @fn on one of them.
241 *
242 * returns when both are completed.
243 */
244int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
245{
246 struct cpu_stop_done done;
247 struct cpu_stop_work work1, work2;
248 struct irq_cpu_stop_queue_work_info call_args;
249 struct multi_stop_data msdata;
250
251 preempt_disable();
252 msdata = (struct multi_stop_data){
253 .fn = fn,
254 .data = arg,
255 .num_threads = 2,
256 .active_cpus = cpumask_of(cpu1),
257 };
258
259 work1 = work2 = (struct cpu_stop_work){
260 .fn = multi_cpu_stop,
261 .arg = &msdata,
262 .done = &done
263 };
264
265 call_args = (struct irq_cpu_stop_queue_work_info){
266 .cpu1 = cpu1,
267 .cpu2 = cpu2,
268 .work1 = &work1,
269 .work2 = &work2,
270 };
271
272 cpu_stop_init_done(&done, 2);
273 set_state(&msdata, MULTI_STOP_PREPARE);
274
275 /*
276 * If we observe both CPUs active we know _cpu_down() cannot yet have
277 * queued its stop_machine works and therefore ours will get executed
 278 * first. Or it's not either one of our CPUs that's getting unplugged,
279 * in which case we don't care.
280 *
281 * This relies on the stopper workqueues to be FIFO.
282 */
283 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
284 preempt_enable();
285 return -ENOENT;
286 }
287
288 lg_local_lock(&stop_cpus_lock);
289 /*
290 * Queuing needs to be done by the lowest numbered CPU, to ensure
291 * that works are always queued in the same order on every CPU.
292 * This prevents deadlocks.
293 */
294 smp_call_function_single(min(cpu1, cpu2),
295 &irq_cpu_stop_queue_work,
296 &call_args, 0);
297 lg_local_unlock(&stop_cpus_lock);
298 preempt_enable();
299
300 wait_for_completion(&done.completion);
301
302 return done.executed ? done.ret : -ENOENT;
303}
304
118/** 305/**
119 * stop_one_cpu_nowait - stop a cpu but don't wait for completion 306 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
120 * @cpu: cpu to stop 307 * @cpu: cpu to stop
@@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 * preempted by a stopper which might wait for other stoppers 346 * preempted by a stopper which might wait for other stoppers
160 * to enter @fn which can lead to deadlock. 347 * to enter @fn which can lead to deadlock.
161 */ 348 */
162 preempt_disable(); 349 lg_global_lock(&stop_cpus_lock);
163 for_each_cpu(cpu, cpumask) 350 for_each_cpu(cpu, cpumask)
164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); 351 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
165 preempt_enable(); 352 lg_global_unlock(&stop_cpus_lock);
166} 353}
167 354
168static int __stop_cpus(const struct cpumask *cpumask, 355static int __stop_cpus(const struct cpumask *cpumask,
@@ -359,98 +546,14 @@ early_initcall(cpu_stop_init);
359 546
360#ifdef CONFIG_STOP_MACHINE 547#ifdef CONFIG_STOP_MACHINE
361 548
362/* This controls the threads on each CPU. */
363enum stopmachine_state {
364 /* Dummy starting state for thread. */
365 STOPMACHINE_NONE,
366 /* Awaiting everyone to be scheduled. */
367 STOPMACHINE_PREPARE,
368 /* Disable interrupts. */
369 STOPMACHINE_DISABLE_IRQ,
370 /* Run the function */
371 STOPMACHINE_RUN,
372 /* Exit */
373 STOPMACHINE_EXIT,
374};
375
376struct stop_machine_data {
377 int (*fn)(void *);
378 void *data;
379 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
380 unsigned int num_threads;
381 const struct cpumask *active_cpus;
382
383 enum stopmachine_state state;
384 atomic_t thread_ack;
385};
386
387static void set_state(struct stop_machine_data *smdata,
388 enum stopmachine_state newstate)
389{
390 /* Reset ack counter. */
391 atomic_set(&smdata->thread_ack, smdata->num_threads);
392 smp_wmb();
393 smdata->state = newstate;
394}
395
396/* Last one to ack a state moves to the next state. */
397static void ack_state(struct stop_machine_data *smdata)
398{
399 if (atomic_dec_and_test(&smdata->thread_ack))
400 set_state(smdata, smdata->state + 1);
401}
402
403/* This is the cpu_stop function which stops the CPU. */
404static int stop_machine_cpu_stop(void *data)
405{
406 struct stop_machine_data *smdata = data;
407 enum stopmachine_state curstate = STOPMACHINE_NONE;
408 int cpu = smp_processor_id(), err = 0;
409 unsigned long flags;
410 bool is_active;
411
412 /*
413 * When called from stop_machine_from_inactive_cpu(), irq might
414 * already be disabled. Save the state and restore it on exit.
415 */
416 local_save_flags(flags);
417
418 if (!smdata->active_cpus)
419 is_active = cpu == cpumask_first(cpu_online_mask);
420 else
421 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
422
423 /* Simple state machine */
424 do {
425 /* Chill out and ensure we re-read stopmachine_state. */
426 cpu_relax();
427 if (smdata->state != curstate) {
428 curstate = smdata->state;
429 switch (curstate) {
430 case STOPMACHINE_DISABLE_IRQ:
431 local_irq_disable();
432 hard_irq_disable();
433 break;
434 case STOPMACHINE_RUN:
435 if (is_active)
436 err = smdata->fn(smdata->data);
437 break;
438 default:
439 break;
440 }
441 ack_state(smdata);
442 }
443 } while (curstate != STOPMACHINE_EXIT);
444
445 local_irq_restore(flags);
446 return err;
447}
448
449int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 549int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
450{ 550{
451 struct stop_machine_data smdata = { .fn = fn, .data = data, 551 struct multi_stop_data msdata = {
452 .num_threads = num_online_cpus(), 552 .fn = fn,
453 .active_cpus = cpus }; 553 .data = data,
554 .num_threads = num_online_cpus(),
555 .active_cpus = cpus,
556 };
454 557
455 if (!stop_machine_initialized) { 558 if (!stop_machine_initialized) {
456 /* 559 /*
@@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
461 unsigned long flags; 564 unsigned long flags;
462 int ret; 565 int ret;
463 566
464 WARN_ON_ONCE(smdata.num_threads != 1); 567 WARN_ON_ONCE(msdata.num_threads != 1);
465 568
466 local_irq_save(flags); 569 local_irq_save(flags);
467 hard_irq_disable(); 570 hard_irq_disable();
@@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
472 } 575 }
473 576
474 /* Set the initial state and stop all online cpus. */ 577 /* Set the initial state and stop all online cpus. */
475 set_state(&smdata, STOPMACHINE_PREPARE); 578 set_state(&msdata, MULTI_STOP_PREPARE);
476 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 579 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
477} 580}
478 581
479int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 582int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
513int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 616int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
514 const struct cpumask *cpus) 617 const struct cpumask *cpus)
515{ 618{
516 struct stop_machine_data smdata = { .fn = fn, .data = data, 619 struct multi_stop_data msdata = { .fn = fn, .data = data,
517 .active_cpus = cpus }; 620 .active_cpus = cpus };
518 struct cpu_stop_done done; 621 struct cpu_stop_done done;
519 int ret; 622 int ret;
520 623
521 /* Local CPU must be inactive and CPU hotplug in progress. */ 624 /* Local CPU must be inactive and CPU hotplug in progress. */
522 BUG_ON(cpu_active(raw_smp_processor_id())); 625 BUG_ON(cpu_active(raw_smp_processor_id()));
523 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ 626 msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
524 627
525 /* No proper task established and can't sleep - busy wait for lock. */ 628 /* No proper task established and can't sleep - busy wait for lock. */
526 while (!mutex_trylock(&stop_cpus_mutex)) 629 while (!mutex_trylock(&stop_cpus_mutex))
527 cpu_relax(); 630 cpu_relax();
528 631
529 /* Schedule work on other CPUs and execute directly for local CPU */ 632 /* Schedule work on other CPUs and execute directly for local CPU */
530 set_state(&smdata, STOPMACHINE_PREPARE); 633 set_state(&msdata, MULTI_STOP_PREPARE);
531 cpu_stop_init_done(&done, num_active_cpus()); 634 cpu_stop_init_done(&done, num_active_cpus());
532 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, 635 queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
533 &done); 636 &done);
534 ret = stop_machine_cpu_stop(&smdata); 637 ret = multi_cpu_stop(&msdata);
535 638
536 /* Busy wait for completion. */ 639 /* Busy wait for completion. */
537 while (!completion_done(&done.completion)) 640 while (!completion_done(&done.completion))
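
The new multi_cpu_stop() above is a lock-step state machine: every stopper spins on msdata->state, and the last thread to ack a state advances it for everyone, which is how all CPUs are guaranteed to have interrupts off before MULTI_STOP_RUN calls @fn. A rough userspace model of that hand-shake (pthreads plus C11 atomics; it deliberately ignores interrupt disabling and the stop_cpus_lock queueing order the patch also introduces):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4

enum { PREPARE, RUN, EXIT };

static _Atomic int state = PREPARE;
static _Atomic int thread_ack = NTHREADS;

/* Last thread to ack the current state moves everyone to the next one. */
static void ack_state(void)
{
        if (atomic_fetch_sub(&thread_ack, 1) == 1) {
                atomic_store(&thread_ack, NTHREADS);
                atomic_fetch_add(&state, 1);
        }
}

static void *stopper(void *arg)
{
        int curstate = -1;

        do {
                if (atomic_load(&state) != curstate) {
                        curstate = atomic_load(&state);
                        if (curstate == RUN && (long)arg == 0)
                                printf("active thread runs fn()\n");
                        ack_state();
                }
        } while (curstate != EXIT);
        return NULL;
}

int main(void)
{
        pthread_t t[NTHREADS];
        long i;

        for (i = 0; i < NTHREADS; i++)
                pthread_create(&t[i], NULL, stopper, (void *)i);
        for (i = 0; i < NTHREADS; i++)
                pthread_join(t[i], NULL);
        return 0;
}

The ack counter plays the role of msdata->thread_ack; in the kernel the reset and the state write are ordered by an explicit smp_wmb(), which sequentially consistent atomics provide for free here.
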
diff --git a/kernel/sys.c b/kernel/sys.c
index 771129b299f8..c72311324ea7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,7 +16,6 @@
16#include <linux/perf_event.h> 16#include <linux/perf_event.h>
17#include <linux/resource.h> 17#include <linux/resource.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/kexec.h>
20#include <linux/workqueue.h> 19#include <linux/workqueue.h>
21#include <linux/capability.h> 20#include <linux/capability.h>
22#include <linux/device.h> 21#include <linux/device.h>
@@ -337,7 +336,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
337 if (rgid != (gid_t) -1) { 336 if (rgid != (gid_t) -1) {
338 if (gid_eq(old->gid, krgid) || 337 if (gid_eq(old->gid, krgid) ||
339 gid_eq(old->egid, krgid) || 338 gid_eq(old->egid, krgid) ||
340 nsown_capable(CAP_SETGID)) 339 ns_capable(old->user_ns, CAP_SETGID))
341 new->gid = krgid; 340 new->gid = krgid;
342 else 341 else
343 goto error; 342 goto error;
@@ -346,7 +345,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
346 if (gid_eq(old->gid, kegid) || 345 if (gid_eq(old->gid, kegid) ||
347 gid_eq(old->egid, kegid) || 346 gid_eq(old->egid, kegid) ||
348 gid_eq(old->sgid, kegid) || 347 gid_eq(old->sgid, kegid) ||
349 nsown_capable(CAP_SETGID)) 348 ns_capable(old->user_ns, CAP_SETGID))
350 new->egid = kegid; 349 new->egid = kegid;
351 else 350 else
352 goto error; 351 goto error;
@@ -387,7 +386,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
387 old = current_cred(); 386 old = current_cred();
388 387
389 retval = -EPERM; 388 retval = -EPERM;
390 if (nsown_capable(CAP_SETGID)) 389 if (ns_capable(old->user_ns, CAP_SETGID))
391 new->gid = new->egid = new->sgid = new->fsgid = kgid; 390 new->gid = new->egid = new->sgid = new->fsgid = kgid;
392 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) 391 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
393 new->egid = new->fsgid = kgid; 392 new->egid = new->fsgid = kgid;
@@ -471,7 +470,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
471 new->uid = kruid; 470 new->uid = kruid;
472 if (!uid_eq(old->uid, kruid) && 471 if (!uid_eq(old->uid, kruid) &&
473 !uid_eq(old->euid, kruid) && 472 !uid_eq(old->euid, kruid) &&
474 !nsown_capable(CAP_SETUID)) 473 !ns_capable(old->user_ns, CAP_SETUID))
475 goto error; 474 goto error;
476 } 475 }
477 476
@@ -480,7 +479,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
480 if (!uid_eq(old->uid, keuid) && 479 if (!uid_eq(old->uid, keuid) &&
481 !uid_eq(old->euid, keuid) && 480 !uid_eq(old->euid, keuid) &&
482 !uid_eq(old->suid, keuid) && 481 !uid_eq(old->suid, keuid) &&
483 !nsown_capable(CAP_SETUID)) 482 !ns_capable(old->user_ns, CAP_SETUID))
484 goto error; 483 goto error;
485 } 484 }
486 485
@@ -534,7 +533,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
534 old = current_cred(); 533 old = current_cred();
535 534
536 retval = -EPERM; 535 retval = -EPERM;
537 if (nsown_capable(CAP_SETUID)) { 536 if (ns_capable(old->user_ns, CAP_SETUID)) {
538 new->suid = new->uid = kuid; 537 new->suid = new->uid = kuid;
539 if (!uid_eq(kuid, old->uid)) { 538 if (!uid_eq(kuid, old->uid)) {
540 retval = set_user(new); 539 retval = set_user(new);
@@ -591,7 +590,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
591 old = current_cred(); 590 old = current_cred();
592 591
593 retval = -EPERM; 592 retval = -EPERM;
594 if (!nsown_capable(CAP_SETUID)) { 593 if (!ns_capable(old->user_ns, CAP_SETUID)) {
595 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 594 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
596 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 595 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
597 goto error; 596 goto error;
@@ -673,7 +672,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
673 old = current_cred(); 672 old = current_cred();
674 673
675 retval = -EPERM; 674 retval = -EPERM;
676 if (!nsown_capable(CAP_SETGID)) { 675 if (!ns_capable(old->user_ns, CAP_SETGID)) {
677 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && 676 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
678 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) 677 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
679 goto error; 678 goto error;
@@ -744,7 +743,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
744 743
745 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 744 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
746 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 745 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
747 nsown_capable(CAP_SETUID)) { 746 ns_capable(old->user_ns, CAP_SETUID)) {
748 if (!uid_eq(kuid, old->fsuid)) { 747 if (!uid_eq(kuid, old->fsuid)) {
749 new->fsuid = kuid; 748 new->fsuid = kuid;
750 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 749 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -783,7 +782,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
783 782
784 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || 783 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
785 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || 784 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
786 nsown_capable(CAP_SETGID)) { 785 ns_capable(old->user_ns, CAP_SETGID)) {
787 if (!gid_eq(kgid, old->fsgid)) { 786 if (!gid_eq(kgid, old->fsgid)) {
788 new->fsgid = kgid; 787 new->fsgid = kgid;
789 goto change_okay; 788 goto change_okay;
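
The sys.c changes are more mechanical than they look: nsown_capable(cap) was, roughly, shorthand for checking a capability against the caller's own user namespace, and since old == current_cred() in all of these functions, old->user_ns names that same namespace. A simplified reconstruction of the helper being open-coded away (shown for context, not copied from the tree):

static inline bool nsown_capable(int cap)
{
        /* "own" here means the user namespace of current's credentials */
        return ns_capable(current_user_ns(), cap);
}

Spelling out the namespace at every call site keeps each permission check explicit about which namespace it is judged in, without changing behaviour.
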
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 07f6fc468e17..34a604726d0b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
190 190
191#ifdef CONFIG_MAGIC_SYSRQ 191#ifdef CONFIG_MAGIC_SYSRQ
 192/* Note: sysrq code uses its own private copy */ 192/* Note: sysrq code uses its own private copy */
193static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 193static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
194 194
195static int sysrq_sysctl_handler(ctl_table *table, int write, 195static int sysrq_sysctl_handler(ctl_table *table, int write,
196 void __user *buffer, size_t *lenp, 196 void __user *buffer, size_t *lenp,
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = {
371 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
372 }, 372 },
373 { 373 {
374 .procname = "numa_balancing_scan_period_reset",
375 .data = &sysctl_numa_balancing_scan_period_reset,
376 .maxlen = sizeof(unsigned int),
377 .mode = 0644,
378 .proc_handler = proc_dointvec,
379 },
380 {
381 .procname = "numa_balancing_scan_period_max_ms", 374 .procname = "numa_balancing_scan_period_max_ms",
382 .data = &sysctl_numa_balancing_scan_period_max, 375 .data = &sysctl_numa_balancing_scan_period_max,
383 .maxlen = sizeof(unsigned int), 376 .maxlen = sizeof(unsigned int),
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
391 .mode = 0644, 384 .mode = 0644,
392 .proc_handler = proc_dointvec, 385 .proc_handler = proc_dointvec,
393 }, 386 },
387 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int),
398 .mode = 0644,
399 .proc_handler = proc_dointvec,
400 },
394#endif /* CONFIG_NUMA_BALANCING */ 401#endif /* CONFIG_NUMA_BALANCING */
395#endif /* CONFIG_SCHED_DEBUG */ 402#endif /* CONFIG_SCHED_DEBUG */
396 { 403 {
@@ -962,9 +969,10 @@ static struct ctl_table kern_table[] = {
962 { 969 {
963 .procname = "hung_task_check_count", 970 .procname = "hung_task_check_count",
964 .data = &sysctl_hung_task_check_count, 971 .data = &sysctl_hung_task_check_count,
965 .maxlen = sizeof(unsigned long), 972 .maxlen = sizeof(int),
966 .mode = 0644, 973 .mode = 0644,
967 .proc_handler = proc_doulongvec_minmax, 974 .proc_handler = proc_dointvec_minmax,
975 .extra1 = &zero,
968 }, 976 },
969 { 977 {
970 .procname = "hung_task_timeout_secs", 978 .procname = "hung_task_timeout_secs",
@@ -1049,6 +1057,7 @@ static struct ctl_table kern_table[] = {
1049 .maxlen = sizeof(sysctl_perf_event_sample_rate), 1057 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1050 .mode = 0644, 1058 .mode = 0644,
1051 .proc_handler = perf_proc_update_handler, 1059 .proc_handler = perf_proc_update_handler,
1060 .extra1 = &one,
1052 }, 1061 },
1053 { 1062 {
1054 .procname = "perf_cpu_time_max_percent", 1063 .procname = "perf_cpu_time_max_percent",
@@ -1225,7 +1234,7 @@ static struct ctl_table vm_table[] = {
1225 .data = &hugepages_treat_as_movable, 1234 .data = &hugepages_treat_as_movable,
1226 .maxlen = sizeof(int), 1235 .maxlen = sizeof(int),
1227 .mode = 0644, 1236 .mode = 0644,
1228 .proc_handler = hugetlb_treat_movable_handler, 1237 .proc_handler = proc_dointvec,
1229 }, 1238 },
1230 { 1239 {
1231 .procname = "nr_overcommit_hugepages", 1240 .procname = "nr_overcommit_hugepages",
@@ -1471,14 +1480,14 @@ static struct ctl_table fs_table[] = {
1471 { 1480 {
1472 .procname = "inode-nr", 1481 .procname = "inode-nr",
1473 .data = &inodes_stat, 1482 .data = &inodes_stat,
1474 .maxlen = 2*sizeof(int), 1483 .maxlen = 2*sizeof(long),
1475 .mode = 0444, 1484 .mode = 0444,
1476 .proc_handler = proc_nr_inodes, 1485 .proc_handler = proc_nr_inodes,
1477 }, 1486 },
1478 { 1487 {
1479 .procname = "inode-state", 1488 .procname = "inode-state",
1480 .data = &inodes_stat, 1489 .data = &inodes_stat,
1481 .maxlen = 7*sizeof(int), 1490 .maxlen = 7*sizeof(long),
1482 .mode = 0444, 1491 .mode = 0444,
1483 .proc_handler = proc_nr_inodes, 1492 .proc_handler = proc_nr_inodes,
1484 }, 1493 },
@@ -1508,7 +1517,7 @@ static struct ctl_table fs_table[] = {
1508 { 1517 {
1509 .procname = "dentry-state", 1518 .procname = "dentry-state",
1510 .data = &dentry_stat, 1519 .data = &dentry_stat,
1511 .maxlen = 6*sizeof(int), 1520 .maxlen = 6*sizeof(long),
1512 .mode = 0444, 1521 .mode = 0444,
1513 .proc_handler = proc_nr_dentry, 1522 .proc_handler = proc_nr_dentry,
1514 }, 1523 },
@@ -2214,8 +2223,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2214 *i = val; 2223 *i = val;
2215 } else { 2224 } else {
2216 val = convdiv * (*i) / convmul; 2225 val = convdiv * (*i) / convmul;
2217 if (!first) 2226 if (!first) {
2218 err = proc_put_char(&buffer, &left, '\t'); 2227 err = proc_put_char(&buffer, &left, '\t');
2228 if (err)
2229 break;
2230 }
2219 err = proc_put_long(&buffer, &left, val, false); 2231 err = proc_put_long(&buffer, &left, val, false);
2220 if (err) 2232 if (err)
2221 break; 2233 break;
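
The hung_task_check_count entry above now follows the standard clamped-integer pattern: an int backed by proc_dointvec_minmax(), with extra1/extra2 supplying the bounds. A minimal kernel-context sketch of that pattern, using an invented knob (all demo_* names are hypothetical):

#include <linux/sysctl.h>

static int demo_min;
static int demo_max = 100;
static int demo_value = 10;

static struct ctl_table demo_table[] = {
        {
                .procname       = "demo_value",
                .data           = &demo_value,
                .maxlen         = sizeof(int),          /* must match the handler   */
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &demo_min,            /* reject writes below this */
                .extra2         = &demo_max,            /* ...and above this        */
        },
        { }
};

A .data/.maxlen/.proc_handler mismatch (an int read as an unsigned long, or the reverse) is exactly the sort of inconsistency the hunk appears to be cleaning up.
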
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b609213ca9a2..653cbbd9e7ad 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1024,7 +1024,7 @@ static ssize_t bin_intvec(struct file *file,
1024 if (get_user(value, vec + i)) 1024 if (get_user(value, vec + i))
1025 goto out_kfree; 1025 goto out_kfree;
1026 1026
1027 str += snprintf(str, end - str, "%lu\t", value); 1027 str += scnprintf(str, end - str, "%lu\t", value);
1028 } 1028 }
1029 1029
1030 result = kernel_write(file, buffer, str - buffer, 0); 1030 result = kernel_write(file, buffer, str - buffer, 0);
@@ -1095,7 +1095,7 @@ static ssize_t bin_ulongvec(struct file *file,
1095 if (get_user(value, vec + i)) 1095 if (get_user(value, vec + i))
1096 goto out_kfree; 1096 goto out_kfree;
1097 1097
1098 str += snprintf(str, end - str, "%lu\t", value); 1098 str += scnprintf(str, end - str, "%lu\t", value);
1099 } 1099 }
1100 1100
1101 result = kernel_write(file, buffer, str - buffer, 0); 1101 result = kernel_write(file, buffer, str - buffer, 0);
@@ -1205,7 +1205,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1205 if (get_user(dnaddr, (__le16 __user *)newval)) 1205 if (get_user(dnaddr, (__le16 __user *)newval))
1206 goto out; 1206 goto out;
1207 1207
1208 len = snprintf(buf, sizeof(buf), "%hu.%hu", 1208 len = scnprintf(buf, sizeof(buf), "%hu.%hu",
1209 le16_to_cpu(dnaddr) >> 10, 1209 le16_to_cpu(dnaddr) >> 10,
1210 le16_to_cpu(dnaddr) & 0x3ff); 1210 le16_to_cpu(dnaddr) & 0x3ff);
1211 1211
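
All three sysctl_binary.c hunks are the same fix: snprintf() returns the length that would have been written, so advancing a cursor by its return value can step past the end of the buffer on truncation, while scnprintf() returns the number of characters actually stored. A small userspace demonstration of the hazard (plain libc snprintf(); scnprintf() itself lives in the kernel's lib/vsprintf.c):

#include <stdio.h>

int main(void)
{
        char buf[8];
        char *str = buf, *end = buf + sizeof(buf);

        /* Too small on purpose: the value needs 10 bytes, we have 8. */
        str += snprintf(str, end - str, "%lu\t", 123456789UL);

        printf("cursor advanced %td bytes from the start of a %zu-byte buffer\n",
               str - buf, sizeof(buf));
        return 0;
}

With scnprintf() the cursor stops at the end of the buffer instead, so the later kernel_write(file, buffer, str - buffer, 0) never reads past what was actually formatted.
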
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
new file mode 100644
index 000000000000..4aef390671cb
--- /dev/null
+++ b/kernel/system_certificates.S
@@ -0,0 +1,10 @@
1#include <linux/export.h>
2#include <linux/init.h>
3
4 __INITRODATA
5
6 .globl VMLINUX_SYMBOL(system_certificate_list)
7VMLINUX_SYMBOL(system_certificate_list):
8 .incbin "kernel/x509_certificate_list"
9 .globl VMLINUX_SYMBOL(system_certificate_list_end)
10VMLINUX_SYMBOL(system_certificate_list_end):
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
new file mode 100644
index 000000000000..564dd93430a2
--- /dev/null
+++ b/kernel/system_keyring.c
@@ -0,0 +1,105 @@
1/* System trusted keyring for trusted public keys
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/export.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/cred.h>
16#include <linux/err.h>
17#include <keys/asymmetric-type.h>
18#include <keys/system_keyring.h>
19#include "module-internal.h"
20
21struct key *system_trusted_keyring;
22EXPORT_SYMBOL_GPL(system_trusted_keyring);
23
24extern __initconst const u8 system_certificate_list[];
25extern __initconst const u8 system_certificate_list_end[];
26
27/*
28 * Load the compiled-in keys
29 */
30static __init int system_trusted_keyring_init(void)
31{
32 pr_notice("Initialise system trusted keyring\n");
33
34 system_trusted_keyring =
35 keyring_alloc(".system_keyring",
36 KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
37 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
38 KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
39 KEY_ALLOC_NOT_IN_QUOTA, NULL);
40 if (IS_ERR(system_trusted_keyring))
41 panic("Can't allocate system trusted keyring\n");
42
43 set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
44 return 0;
45}
46
47/*
48 * Must be initialised before we try and load the keys into the keyring.
49 */
50device_initcall(system_trusted_keyring_init);
51
52/*
53 * Load the compiled-in list of X.509 certificates.
54 */
55static __init int load_system_certificate_list(void)
56{
57 key_ref_t key;
58 const u8 *p, *end;
59 size_t plen;
60
61 pr_notice("Loading compiled-in X.509 certificates\n");
62
63 end = system_certificate_list_end;
64 p = system_certificate_list;
65 while (p < end) {
66 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
67 * than 256 bytes in size.
68 */
69 if (end - p < 4)
70 goto dodgy_cert;
71 if (p[0] != 0x30 &&
72 p[1] != 0x82)
73 goto dodgy_cert;
74 plen = (p[2] << 8) | p[3];
75 plen += 4;
76 if (plen > end - p)
77 goto dodgy_cert;
78
79 key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
80 "asymmetric",
81 NULL,
82 p,
83 plen,
84 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
85 KEY_USR_VIEW | KEY_USR_READ),
86 KEY_ALLOC_NOT_IN_QUOTA |
87 KEY_ALLOC_TRUSTED);
88 if (IS_ERR(key)) {
89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
90 PTR_ERR(key));
91 } else {
92 pr_notice("Loaded X.509 cert '%s'\n",
93 key_ref_to_ptr(key)->description);
94 key_ref_put(key);
95 }
96 p += plen;
97 }
98
99 return 0;
100
101dodgy_cert:
102 pr_err("Problem parsing in-kernel X.509 certificate list\n");
103 return 0;
104}
105late_initcall(load_system_certificate_list);
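
load_system_certificate_list() walks a blob of concatenated DER certificates using only the fixed 4-byte header of each one: the ASN.1 SEQUENCE tag 0x30, the long-form length marker 0x82, and a two-byte big-endian payload length, so a cert occupies payload length + 4 bytes. A standalone sketch of that header walk (the sample header is fabricated and carries no real certificate data):

#include <stddef.h>
#include <stdio.h>

/* Returns the number of bytes the certificate at p spans, or 0 if the
 * header does not parse or would run past the available bytes. */
static size_t der_cert_len(const unsigned char *p, size_t avail)
{
        size_t plen;

        if (avail < 4 || p[0] != 0x30 || p[1] != 0x82)
                return 0;
        plen = ((size_t)p[2] << 8 | p[3]) + 4;
        return plen <= avail ? plen : 0;
}

int main(void)
{
        /* Header claiming a 0x0123-byte body. */
        const unsigned char hdr[4] = { 0x30, 0x82, 0x01, 0x23 };

        printf("certificate spans %zu bytes\n", der_cert_len(hdr, 0x0123 + 4));
        return 0;
}

The 0x82 long form implies a body of at least 256 bytes, which is why the comment in the patch can insist each cert is "more than 256 bytes in size".
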
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 65bd3c92d6f3..8727032e3a6f 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -4,6 +4,23 @@
4 4
5static struct callback_head work_exited; /* all we need is ->next == NULL */ 5static struct callback_head work_exited; /* all we need is ->next == NULL */
6 6
7/**
8 * task_work_add - ask the @task to execute @work->func()
9 * @task: the task which should run the callback
10 * @work: the callback to run
11 * @notify: send the notification if true
12 *
13 * Queue @work for task_work_run() below and notify the @task if @notify.
14 * Fails if the @task is exiting/exited and thus it can't process this @work.
15 * Otherwise @work->func() will be called when the @task returns from kernel
16 * mode or exits.
17 *
18 * This is like the signal handler which runs in kernel mode, but it doesn't
19 * try to wake up the @task.
20 *
21 * RETURNS:
 22 * 0 if it succeeds or -ESRCH.
23 */
7int 24int
8task_work_add(struct task_struct *task, struct callback_head *work, bool notify) 25task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
9{ 26{
@@ -21,11 +38,22 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
21 return 0; 38 return 0;
22} 39}
23 40
41/**
42 * task_work_cancel - cancel a pending work added by task_work_add()
43 * @task: the task which should execute the work
44 * @func: identifies the work to remove
45 *
46 * Find the last queued pending work with ->func == @func and remove
47 * it from queue.
48 *
49 * RETURNS:
50 * The found work or NULL if not found.
51 */
24struct callback_head * 52struct callback_head *
25task_work_cancel(struct task_struct *task, task_work_func_t func) 53task_work_cancel(struct task_struct *task, task_work_func_t func)
26{ 54{
27 struct callback_head **pprev = &task->task_works; 55 struct callback_head **pprev = &task->task_works;
28 struct callback_head *work = NULL; 56 struct callback_head *work;
29 unsigned long flags; 57 unsigned long flags;
30 /* 58 /*
31 * If cmpxchg() fails we continue without updating pprev. 59 * If cmpxchg() fails we continue without updating pprev.
@@ -35,7 +63,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
35 */ 63 */
36 raw_spin_lock_irqsave(&task->pi_lock, flags); 64 raw_spin_lock_irqsave(&task->pi_lock, flags);
37 while ((work = ACCESS_ONCE(*pprev))) { 65 while ((work = ACCESS_ONCE(*pprev))) {
38 read_barrier_depends(); 66 smp_read_barrier_depends();
39 if (work->func != func) 67 if (work->func != func)
40 pprev = &work->next; 68 pprev = &work->next;
41 else if (cmpxchg(pprev, work, work->next) == work) 69 else if (cmpxchg(pprev, work, work->next) == work)
@@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
46 return work; 74 return work;
47} 75}
48 76
77/**
78 * task_work_run - execute the works added by task_work_add()
79 *
80 * Flush the pending works. Should be used by the core kernel code.
81 * Called before the task returns to the user-mode or stops, or when
82 * it exits. In the latter case task_work_add() can no longer add the
83 * new work after task_work_run() returns.
84 */
49void task_work_run(void) 85void task_work_run(void)
50{ 86{
51 struct task_struct *task = current; 87 struct task_struct *task = current;
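
The new kerneldoc above documents task_work_add(), which pushes the callback onto a singly linked list hanging off the task with a cmpxchg() loop, so the add path needs no lock. A userspace model of that lock-free push using C11 atomics (simplified; the real code must additionally refuse the push once the work_exited sentinel marks the task as exiting):

#include <stdatomic.h>
#include <stdio.h>

struct cb_head {
        struct cb_head *next;
        void (*func)(struct cb_head *);
};

static _Atomic(struct cb_head *) head;

static void push(struct cb_head *work)
{
        struct cb_head *old = atomic_load(&head);

        do {
                work->next = old;       /* link before publishing */
        } while (!atomic_compare_exchange_weak(&head, &old, work));
}

static void hello(struct cb_head *cb)
{
        printf("ran callback %p\n", (void *)cb);
}

int main(void)
{
        struct cb_head cb = { .func = hello };
        struct cb_head *w;

        push(&cb);
        w = atomic_load(&head);
        w->func(w);
        return 0;
}

task_work_cancel(), by contrast, walks and unlinks under pi_lock, as the surrounding code in the hunk shows.
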
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 145bb4d3bd4d..13d2f7cd65db 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,6 +290,7 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
290 struct listener_list *listeners; 290 struct listener_list *listeners;
291 struct listener *s, *tmp, *s2; 291 struct listener *s, *tmp, *s2;
292 unsigned int cpu; 292 unsigned int cpu;
293 int ret = 0;
293 294
294 if (!cpumask_subset(mask, cpu_possible_mask)) 295 if (!cpumask_subset(mask, cpu_possible_mask))
295 return -EINVAL; 296 return -EINVAL;
@@ -304,9 +305,10 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
304 for_each_cpu(cpu, mask) { 305 for_each_cpu(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), 306 s = kmalloc_node(sizeof(struct listener),
306 GFP_KERNEL, cpu_to_node(cpu)); 307 GFP_KERNEL, cpu_to_node(cpu));
307 if (!s) 308 if (!s) {
309 ret = -ENOMEM;
308 goto cleanup; 310 goto cleanup;
309 311 }
310 s->pid = pid; 312 s->pid = pid;
311 s->valid = 1; 313 s->valid = 1;
312 314
@@ -339,7 +341,7 @@ cleanup:
339 } 341 }
340 up_write(&listeners->sem); 342 up_write(&listeners->sem);
341 } 343 }
342 return 0; 344 return ret;
343} 345}
344 346
345static int parse(struct nlattr *na, struct cpumask *mask) 347static int parse(struct nlattr *na, struct cpumask *mask)
@@ -404,11 +406,15 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
404 if (!na) 406 if (!na)
405 goto err; 407 goto err;
406 408
407 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 409 if (nla_put(skb, type, sizeof(pid), &pid) < 0) {
410 nla_nest_cancel(skb, na);
408 goto err; 411 goto err;
412 }
409 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 413 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
410 if (!ret) 414 if (!ret) {
415 nla_nest_cancel(skb, na);
411 goto err; 416 goto err;
417 }
412 nla_nest_end(skb, na); 418 nla_nest_end(skb, na);
413 419
414 return nla_data(ret); 420 return nla_data(ret);
@@ -667,17 +673,18 @@ err:
667 nlmsg_free(rep_skb); 673 nlmsg_free(rep_skb);
668} 674}
669 675
670static struct genl_ops taskstats_ops = { 676static const struct genl_ops taskstats_ops[] = {
671 .cmd = TASKSTATS_CMD_GET, 677 {
672 .doit = taskstats_user_cmd, 678 .cmd = TASKSTATS_CMD_GET,
673 .policy = taskstats_cmd_get_policy, 679 .doit = taskstats_user_cmd,
674 .flags = GENL_ADMIN_PERM, 680 .policy = taskstats_cmd_get_policy,
675}; 681 .flags = GENL_ADMIN_PERM,
676 682 },
677static struct genl_ops cgroupstats_ops = { 683 {
678 .cmd = CGROUPSTATS_CMD_GET, 684 .cmd = CGROUPSTATS_CMD_GET,
679 .doit = cgroupstats_user_cmd, 685 .doit = cgroupstats_user_cmd,
680 .policy = cgroupstats_cmd_get_policy, 686 .policy = cgroupstats_cmd_get_policy,
687 },
681}; 688};
682 689
683/* Needed early in initialization */ 690/* Needed early in initialization */
@@ -696,26 +703,13 @@ static int __init taskstats_init(void)
696{ 703{
697 int rc; 704 int rc;
698 705
699 rc = genl_register_family(&family); 706 rc = genl_register_family_with_ops(&family, taskstats_ops);
700 if (rc) 707 if (rc)
701 return rc; 708 return rc;
702 709
703 rc = genl_register_ops(&family, &taskstats_ops);
704 if (rc < 0)
705 goto err;
706
707 rc = genl_register_ops(&family, &cgroupstats_ops);
708 if (rc < 0)
709 goto err_cgroup_ops;
710
711 family_registered = 1; 710 family_registered = 1;
712 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 711 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
713 return 0; 712 return 0;
714err_cgroup_ops:
715 genl_unregister_ops(&family, &taskstats_ops);
716err:
717 genl_unregister_family(&family);
718 return rc;
719} 713}
720 714
721/* 715/*
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 2b62fe86f9ec..3ce6e8c5f3fc 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -100,7 +100,7 @@ config NO_HZ_FULL
100 # RCU_USER_QS dependency 100 # RCU_USER_QS dependency
101 depends on HAVE_CONTEXT_TRACKING 101 depends on HAVE_CONTEXT_TRACKING
102 # VIRT_CPU_ACCOUNTING_GEN dependency 102 # VIRT_CPU_ACCOUNTING_GEN dependency
103 depends on 64BIT 103 depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
104 select NO_HZ_COMMON 104 select NO_HZ_COMMON
105 select RCU_USER_QS 105 select RCU_USER_QS
106 select RCU_NOCB_CPU 106 select RCU_NOCB_CPU
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index eec50fcef9e4..88c9c65a430d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; 490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
491 491
492 if (!alarmtimer_get_rtcdev()) 492 if (!alarmtimer_get_rtcdev())
493 return -ENOTSUPP; 493 return -EINVAL;
494 494
495 return hrtimer_get_res(baseid, tp); 495 return hrtimer_get_res(baseid, tp);
496} 496}
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; 507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
508 508
509 if (!alarmtimer_get_rtcdev()) 509 if (!alarmtimer_get_rtcdev())
510 return -ENOTSUPP; 510 return -EINVAL;
511 511
512 *tp = ktime_to_timespec(base->gettime()); 512 *tp = ktime_to_timespec(base->gettime());
513 return 0; 513 return 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 38959c866789..086ad6043bcb 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -33,29 +33,64 @@ struct ce_unbind {
33 int res; 33 int res;
34}; 34};
35 35
36/** 36static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
37 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 37 bool ismax)
38 * @latch: value to convert
39 * @evt: pointer to clock event device descriptor
40 *
41 * Math helper, returns latch value converted to nanoseconds (bound checked)
42 */
43u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
44{ 38{
45 u64 clc = (u64) latch << evt->shift; 39 u64 clc = (u64) latch << evt->shift;
40 u64 rnd;
46 41
47 if (unlikely(!evt->mult)) { 42 if (unlikely(!evt->mult)) {
48 evt->mult = 1; 43 evt->mult = 1;
49 WARN_ON(1); 44 WARN_ON(1);
50 } 45 }
46 rnd = (u64) evt->mult - 1;
47
48 /*
49 * Upper bound sanity check. If the backwards conversion is
50 * not equal latch, we know that the above shift overflowed.
51 */
52 if ((clc >> evt->shift) != (u64)latch)
53 clc = ~0ULL;
54
55 /*
56 * Scaled math oddities:
57 *
58 * For mult <= (1 << shift) we can safely add mult - 1 to
59 * prevent integer rounding loss. So the backwards conversion
60 * from nsec to device ticks will be correct.
61 *
62 * For mult > (1 << shift), i.e. device frequency is > 1GHz we
63 * need to be careful. Adding mult - 1 will result in a value
64 * which when converted back to device ticks can be larger
65 * than latch by up to (mult - 1) >> shift. For the min_delta
66 * calculation we still want to apply this in order to stay
67 * above the minimum device ticks limit. For the upper limit
68 * we would end up with a latch value larger than the upper
69 * limit of the device, so we omit the add to stay below the
70 * device upper boundary.
71 *
72 * Also omit the add if it would overflow the u64 boundary.
73 */
74 if ((~0ULL - clc > rnd) &&
75 (!ismax || evt->mult <= (1U << evt->shift)))
76 clc += rnd;
51 77
52 do_div(clc, evt->mult); 78 do_div(clc, evt->mult);
53 if (clc < 1000)
54 clc = 1000;
55 if (clc > KTIME_MAX)
56 clc = KTIME_MAX;
57 79
58 return clc; 80 /* Deltas less than 1usec are pointless noise */
81 return clc > 1000 ? clc : 1000;
82}
83
84/**
85 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
86 * @latch: value to convert
87 * @evt: pointer to clock event device descriptor
88 *
89 * Math helper, returns latch value converted to nanoseconds (bound checked)
90 */
91u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
92{
93 return cev_delta2ns(latch, evt, false);
59} 94}
60EXPORT_SYMBOL_GPL(clockevent_delta2ns); 95EXPORT_SYMBOL_GPL(clockevent_delta2ns);
61 96
@@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq)
380 sec = 600; 415 sec = 600;
381 416
382 clockevents_calc_mult_shift(dev, freq, sec); 417 clockevents_calc_mult_shift(dev, freq, sec);
383 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); 418 dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false);
384 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); 419 dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true);
385} 420}
386 421
387/** 422/**
@@ -584,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
584 const char *buf, size_t count) 619 const char *buf, size_t count)
585{ 620{
586 char name[CS_NAME_LEN]; 621 char name[CS_NAME_LEN];
587 size_t ret = sysfs_get_uname(buf, name, count); 622 ssize_t ret = sysfs_get_uname(buf, name, count);
588 struct clock_event_device *ce; 623 struct clock_event_device *ce;
589 624
590 if (ret < 0) 625 if (ret < 0)
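
The point of the rounding that the new cev_delta2ns() adds is easiest to see with numbers: without adding mult - 1 before the divide, a minimum latch of a few device ticks can convert to a nanosecond value that, converted back the way the tick layer programs the device (ns * mult >> shift), comes out one tick short of the hardware minimum. A standalone recomputation with made-up but realistically sized values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t mult = 26843546, latch = 3;    /* ~6.25 MHz device, 160 ns/tick */
        unsigned int shift = 32;

        uint64_t plain = (latch << shift) / mult;               /* old truncation   */
        uint64_t round = ((latch << shift) + mult - 1) / mult;  /* new min rounding */

        printf("plain: %llu ns -> %llu ticks\n",
               (unsigned long long)plain,
               (unsigned long long)((plain * mult) >> shift));
        printf("round: %llu ns -> %llu ticks\n",
               (unsigned long long)round,
               (unsigned long long)((round * mult) >> shift));
        return 0;
}

For the max_delta_ns case (ismax true with a device faster than 1GHz) the add is skipped precisely so the back-conversion cannot exceed the device's upper latch limit, as the long comment in the hunk spells out.
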
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 50a8736757f3..ba3e502c955a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
479static inline void clocksource_resume_watchdog(void) { } 479static inline void clocksource_resume_watchdog(void) { }
480static inline int __clocksource_watchdog_kthread(void) { return 0; } 480static inline int __clocksource_watchdog_kthread(void) { return 0; }
481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } 481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
482void clocksource_mark_unstable(struct clocksource *cs) { }
482 483
483#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 484#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
484 485
@@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
537} 538}
538 539
539/** 540/**
540 * clocksource_max_deferment - Returns max time the clocksource can be deferred 541 * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
541 * @cs: Pointer to clocksource 542 * @mult: cycle to nanosecond multiplier
542 * 543 * @shift: cycle to nanosecond divisor (power of two)
544 * @maxadj: maximum adjustment value to mult (~11%)
545 * @mask: bitmask for two's complement subtraction of non 64 bit counters
543 */ 546 */
544static u64 clocksource_max_deferment(struct clocksource *cs) 547u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
545{ 548{
546 u64 max_nsecs, max_cycles; 549 u64 max_nsecs, max_cycles;
547 550
548 /* 551 /*
549 * Calculate the maximum number of cycles that we can pass to the 552 * Calculate the maximum number of cycles that we can pass to the
550 * cyc2ns function without overflowing a 64-bit signed result. The 553 * cyc2ns function without overflowing a 64-bit signed result. The
551 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) 554 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
552 * which is equivalent to the below. 555 * which is equivalent to the below.
553 * max_cycles < (2^63)/(cs->mult + cs->maxadj) 556 * max_cycles < (2^63)/(mult + maxadj)
554 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) 557 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
555 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) 558 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
556 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) 559 * max_cycles < 2^(63 - log2(mult + maxadj))
557 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) 560 * max_cycles < 1 << (63 - log2(mult + maxadj))
558 * Please note that we add 1 to the result of the log2 to account for 561 * Please note that we add 1 to the result of the log2 to account for
559 * any rounding errors, ensure the above inequality is satisfied and 562 * any rounding errors, ensure the above inequality is satisfied and
560 * no overflow will occur. 563 * no overflow will occur.
561 */ 564 */
562 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); 565 max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
563 566
564 /* 567 /*
565 * The actual maximum number of cycles we can defer the clocksource is 568 * The actual maximum number of cycles we can defer the clocksource is
566 * determined by the minimum of max_cycles and cs->mask. 569 * determined by the minimum of max_cycles and mask.
567 * Note: Here we subtract the maxadj to make sure we don't sleep for 570 * Note: Here we subtract the maxadj to make sure we don't sleep for
568 * too long if there's a large negative adjustment. 571 * too long if there's a large negative adjustment.
569 */ 572 */
570 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 573 max_cycles = min(max_cycles, mask);
571 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, 574 max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
572 cs->shift); 575
576 return max_nsecs;
577}
578
579/**
580 * clocksource_max_deferment - Returns max time the clocksource can be deferred
581 * @cs: Pointer to clocksource
582 *
583 */
584static u64 clocksource_max_deferment(struct clocksource *cs)
585{
586 u64 max_nsecs;
573 587
588 max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
589 cs->mask);
574 /* 590 /*
575 * To ensure that the clocksource does not wrap whilst we are idle, 591 * To ensure that the clocksource does not wrap whilst we are idle,
576 * limit the time the clocksource can be deferred by 12.5%. Please 592 * limit the time the clocksource can be deferred by 12.5%. Please
@@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev,
893 return count; 909 return count;
894} 910}
895 911
896size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) 912ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
897{ 913{
898 size_t ret = cnt; 914 size_t ret = cnt;
899 915
@@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
924 struct device_attribute *attr, 940 struct device_attribute *attr,
925 const char *buf, size_t count) 941 const char *buf, size_t count)
926{ 942{
927 size_t ret; 943 ssize_t ret;
928 944
929 mutex_lock(&clocksource_mutex); 945 mutex_lock(&clocksource_mutex);
930 946
@@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
952{ 968{
953 struct clocksource *cs; 969 struct clocksource *cs;
954 char name[CS_NAME_LEN]; 970 char name[CS_NAME_LEN];
955 size_t ret; 971 ssize_t ret;
956 972
957 ret = sysfs_get_uname(buf, name, count); 973 ret = sysfs_get_uname(buf, name, count);
958 if (ret < 0) 974 if (ret < 0)
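
The size_t -> ssize_t changes above matter because sysfs_get_uname() can return -EINVAL: stored in an unsigned type, the callers' "if (ret < 0)" test can never fire. A small user-space illustration, assuming hypothetical broken_lookup()/fixed_lookup() helpers (ssize_t is the POSIX signed counterpart of size_t):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/types.h>

    /* old prototype: the error is encoded in an unsigned type and gets lost */
    static size_t broken_lookup(void)
    {
        return (size_t)-EINVAL;
    }

    /* new prototype: the error survives as a negative value */
    static ssize_t fixed_lookup(void)
    {
        return -EINVAL;
    }

    int main(void)
    {
        size_t  r1 = broken_lookup();
        ssize_t r2 = fixed_lookup();

        /* "r1 < 0" is always false for an unsigned type, so the error check is dead code */
        printf("unsigned error check fires: %d\n", r1 < 0);
        printf("signed error check fires:   %d\n", r2 < 0);
        return 0;
    }
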
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8f5b3b98577b..af8d1d4f3d55 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
475 * called as close as possible to 500 ms before the new second starts. 475 * called as close as possible to 500 ms before the new second starts.
476 * This code is run on a timer. If the clock is set, that timer 476 * This code is run on a timer. If the clock is set, that timer
477 * may not expire at the correct time. Thus, we adjust... 477 * may not expire at the correct time. Thus, we adjust...
478 * We want the clock to be within a couple of ticks from the target.
478 */ 479 */
479 if (!ntp_synced()) { 480 if (!ntp_synced()) {
480 /* 481 /*
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)
485 } 486 }
486 487
487 getnstimeofday(&now); 488 getnstimeofday(&now);
488 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { 489 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
489 struct timespec adjust = now; 490 struct timespec adjust = now;
490 491
491 fail = -ENODEV; 492 fail = -ENODEV;
@@ -516,13 +517,13 @@ static void sync_cmos_clock(struct work_struct *work)
516 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); 517 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
517} 518}
518 519
519static void notify_cmos_timer(void) 520void ntp_notify_cmos_timer(void)
520{ 521{
521 schedule_delayed_work(&sync_cmos_work, 0); 522 schedule_delayed_work(&sync_cmos_work, 0);
522} 523}
523 524
524#else 525#else
525static inline void notify_cmos_timer(void) { } 526void ntp_notify_cmos_timer(void) { }
526#endif 527#endif
527 528
528 529
@@ -687,8 +688,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
687 if (!(time_status & STA_NANO)) 688 if (!(time_status & STA_NANO))
688 txc->time.tv_usec /= NSEC_PER_USEC; 689 txc->time.tv_usec /= NSEC_PER_USEC;
689 690
690 notify_cmos_timer();
691
692 return result; 691 return result;
693} 692}
694 693
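
The ntp.c hunk above also widens the window around the 500 ms mark in which sync_cmos_clock() is willing to update the RTC, from half a tick to five ticks. A quick stand-alone calculation of what that means at a few common HZ values, assuming tick_nsec is simply NSEC_PER_SEC / HZ (the kernel's value also carries the NTP tick adjustment):

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
        unsigned long hz_values[] = { 100, 250, 1000 };

        for (unsigned i = 0; i < sizeof(hz_values) / sizeof(hz_values[0]); i++) {
            unsigned long long tick_nsec = NSEC_PER_SEC / hz_values[i];

            /* old window: within half a tick of the 500 ms mark; new: within 5 ticks */
            printf("HZ=%-4lu  old +/- %9llu ns   new +/- %9llu ns\n",
                   hz_values[i], tick_nsec / 2, tick_nsec * 5);
        }
        return 0;
    }
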
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0b479a6a22bb..68b799375981 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,25 +8,28 @@
8#include <linux/clocksource.h> 8#include <linux/clocksource.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/jiffies.h> 10#include <linux/jiffies.h>
11#include <linux/ktime.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/syscore_ops.h> 15#include <linux/syscore_ops.h>
15#include <linux/timer.h> 16#include <linux/hrtimer.h>
16#include <linux/sched_clock.h> 17#include <linux/sched_clock.h>
18#include <linux/seqlock.h>
19#include <linux/bitops.h>
17 20
18struct clock_data { 21struct clock_data {
22 ktime_t wrap_kt;
19 u64 epoch_ns; 23 u64 epoch_ns;
20 u32 epoch_cyc; 24 u64 epoch_cyc;
21 u32 epoch_cyc_copy; 25 seqcount_t seq;
22 unsigned long rate; 26 unsigned long rate;
23 u32 mult; 27 u32 mult;
24 u32 shift; 28 u32 shift;
25 bool suspended; 29 bool suspended;
26}; 30};
27 31
28static void sched_clock_poll(unsigned long wrap_ticks); 32static struct hrtimer sched_clock_timer;
29static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
30static int irqtime = -1; 33static int irqtime = -1;
31 34
32core_param(irqtime, irqtime, int, 0400); 35core_param(irqtime, irqtime, int, 0400);
@@ -35,42 +38,46 @@ static struct clock_data cd = {
35 .mult = NSEC_PER_SEC / HZ, 38 .mult = NSEC_PER_SEC / HZ,
36}; 39};
37 40
38static u32 __read_mostly sched_clock_mask = 0xffffffff; 41static u64 __read_mostly sched_clock_mask;
39 42
40static u32 notrace jiffy_sched_clock_read(void) 43static u64 notrace jiffy_sched_clock_read(void)
41{ 44{
42 return (u32)(jiffies - INITIAL_JIFFIES); 45 /*
46 * We don't need to use get_jiffies_64 on 32-bit arches here
47 * because we register with BITS_PER_LONG
48 */
49 return (u64)(jiffies - INITIAL_JIFFIES);
43} 50}
44 51
45static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 52static u32 __read_mostly (*read_sched_clock_32)(void);
53
54static u64 notrace read_sched_clock_32_wrapper(void)
55{
56 return read_sched_clock_32();
57}
58
59static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
46 60
47static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 61static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
48{ 62{
49 return (cyc * mult) >> shift; 63 return (cyc * mult) >> shift;
50} 64}
51 65
52static unsigned long long notrace sched_clock_32(void) 66unsigned long long notrace sched_clock(void)
53{ 67{
54 u64 epoch_ns; 68 u64 epoch_ns;
55 u32 epoch_cyc; 69 u64 epoch_cyc;
56 u32 cyc; 70 u64 cyc;
71 unsigned long seq;
57 72
58 if (cd.suspended) 73 if (cd.suspended)
59 return cd.epoch_ns; 74 return cd.epoch_ns;
60 75
61 /*
62 * Load the epoch_cyc and epoch_ns atomically. We do this by
63 * ensuring that we always write epoch_cyc, epoch_ns and
64 * epoch_cyc_copy in strict order, and read them in strict order.
65 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
66 * the middle of an update, and we should repeat the load.
67 */
68 do { 76 do {
77 seq = read_seqcount_begin(&cd.seq);
69 epoch_cyc = cd.epoch_cyc; 78 epoch_cyc = cd.epoch_cyc;
70 smp_rmb();
71 epoch_ns = cd.epoch_ns; 79 epoch_ns = cd.epoch_ns;
72 smp_rmb(); 80 } while (read_seqcount_retry(&cd.seq, seq));
73 } while (epoch_cyc != cd.epoch_cyc_copy);
74 81
75 cyc = read_sched_clock(); 82 cyc = read_sched_clock();
76 cyc = (cyc - epoch_cyc) & sched_clock_mask; 83 cyc = (cyc - epoch_cyc) & sched_clock_mask;
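
The hunk above replaces the hand-rolled epoch_cyc/epoch_cyc_copy double-write protocol with a seqcount. The sketch below shows the same retry protocol in user-space C11; struct epoch, epoch_write() and epoch_read() are hypothetical names, and it uses sequentially consistent atomics everywhere for brevity where the kernel's seqcount_t gets by with much lighter barriers:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct epoch {
        _Atomic unsigned seq;    /* even = stable, odd = writer in progress */
        _Atomic uint64_t cyc;
        _Atomic uint64_t ns;
    };

    static void epoch_write(struct epoch *e, uint64_t cyc, uint64_t ns)
    {
        atomic_fetch_add(&e->seq, 1);   /* now odd: readers will retry         */
        atomic_store(&e->cyc, cyc);
        atomic_store(&e->ns, ns);
        atomic_fetch_add(&e->seq, 1);   /* even again: the snapshot is stable  */
    }

    static void epoch_read(struct epoch *e, uint64_t *cyc, uint64_t *ns)
    {
        unsigned seq;

        do {
            seq  = atomic_load(&e->seq);
            *cyc = atomic_load(&e->cyc);
            *ns  = atomic_load(&e->ns);
        } while ((seq & 1) || seq != atomic_load(&e->seq));
    }

    int main(void)
    {
        struct epoch e = { 0 };
        uint64_t cyc, ns;

        epoch_write(&e, 123456, 789000);
        epoch_read(&e, &cyc, &ns);
        printf("cyc=%llu ns=%llu\n",
               (unsigned long long)cyc, (unsigned long long)ns);
        return 0;
    }
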
@@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void)
83static void notrace update_sched_clock(void) 90static void notrace update_sched_clock(void)
84{ 91{
85 unsigned long flags; 92 unsigned long flags;
86 u32 cyc; 93 u64 cyc;
87 u64 ns; 94 u64 ns;
88 95
89 cyc = read_sched_clock(); 96 cyc = read_sched_clock();
90 ns = cd.epoch_ns + 97 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, 98 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift); 99 cd.mult, cd.shift);
93 /* 100
94 * Write epoch_cyc and epoch_ns in a way that the update is
95 * detectable in cyc_to_fixed_sched_clock().
96 */
97 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
98 cd.epoch_cyc_copy = cyc; 102 write_seqcount_begin(&cd.seq);
99 smp_wmb();
100 cd.epoch_ns = ns; 103 cd.epoch_ns = ns;
101 smp_wmb();
102 cd.epoch_cyc = cyc; 104 cd.epoch_cyc = cyc;
105 write_seqcount_end(&cd.seq);
103 raw_local_irq_restore(flags); 106 raw_local_irq_restore(flags);
104} 107}
105 108
106static void sched_clock_poll(unsigned long wrap_ticks) 109static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
107{ 110{
108 mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
109 update_sched_clock(); 111 update_sched_clock();
112 hrtimer_forward_now(hrt, cd.wrap_kt);
113 return HRTIMER_RESTART;
110} 114}
111 115
112void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) 116void __init sched_clock_register(u64 (*read)(void), int bits,
117 unsigned long rate)
113{ 118{
114 unsigned long r, w; 119 unsigned long r;
115 u64 res, wrap; 120 u64 res, wrap;
116 char r_unit; 121 char r_unit;
117 122
118 if (cd.rate > rate) 123 if (cd.rate > rate)
119 return; 124 return;
120 125
121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled()); 126 WARN_ON(!irqs_disabled());
123 read_sched_clock = read; 127 read_sched_clock = read;
124 sched_clock_mask = (1ULL << bits) - 1; 128 sched_clock_mask = CLOCKSOURCE_MASK(bits);
125 cd.rate = rate; 129 cd.rate = rate;
126 130
127 /* calculate the mult/shift to convert counter ticks to ns. */ 131 /* calculate the mult/shift to convert counter ticks to ns. */
128 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); 132 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
129 133
130 r = rate; 134 r = rate;
131 if (r >= 4000000) { 135 if (r >= 4000000) {
@@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
138 r_unit = ' '; 142 r_unit = ' ';
139 143
140 /* calculate how many ns until we wrap */ 144 /* calculate how many ns until we wrap */
141 wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); 145 wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
142 do_div(wrap, NSEC_PER_MSEC); 146 cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
143 w = wrap;
144 147
145 /* calculate the ns resolution of this counter */ 148 /* calculate the ns resolution of this counter */
146 res = cyc_to_ns(1ULL, cd.mult, cd.shift); 149 res = cyc_to_ns(1ULL, cd.mult, cd.shift);
147 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", 150 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
148 bits, r, r_unit, res, w); 151 bits, r, r_unit, res, wrap);
149 152
150 /*
151 * Start the timer to keep sched_clock() properly updated and
152 * sets the initial epoch.
153 */
154 sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
155 update_sched_clock(); 153 update_sched_clock();
156 154
157 /* 155 /*
@@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
166 pr_debug("Registered %pF as sched_clock source\n", read); 164 pr_debug("Registered %pF as sched_clock source\n", read);
167} 165}
168 166
169unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; 167void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
170
171unsigned long long notrace sched_clock(void)
172{ 168{
173 return sched_clock_func(); 169 read_sched_clock_32 = read;
170 sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
174} 171}
175 172
176void __init sched_clock_postinit(void) 173void __init sched_clock_postinit(void)
@@ -180,14 +177,22 @@ void __init sched_clock_postinit(void)
 180 * make it the final one one. 177 * make it the final one one.
Hm wait, placeholder to be replaced.
181 */ 178 */
182 if (read_sched_clock == jiffy_sched_clock_read) 179 if (read_sched_clock == jiffy_sched_clock_read)
183 setup_sched_clock(jiffy_sched_clock_read, 32, HZ); 180 sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
184 181
185 sched_clock_poll(sched_clock_timer.data); 182 update_sched_clock();
183
184 /*
185 * Start the timer to keep sched_clock() properly updated and
186 * sets the initial epoch.
187 */
188 hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
189 sched_clock_timer.function = sched_clock_poll;
190 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
186} 191}
187 192
188static int sched_clock_suspend(void) 193static int sched_clock_suspend(void)
189{ 194{
190 sched_clock_poll(sched_clock_timer.data); 195 sched_clock_poll(&sched_clock_timer);
191 cd.suspended = true; 196 cd.suspended = true;
192 return 0; 197 return 0;
193} 198}
@@ -195,7 +200,6 @@ static int sched_clock_suspend(void)
195static void sched_clock_resume(void) 200static void sched_clock_resume(void)
196{ 201{
197 cd.epoch_cyc = read_sched_clock(); 202 cd.epoch_cyc = read_sched_clock();
198 cd.epoch_cyc_copy = cd.epoch_cyc;
199 cd.suspended = false; 203 cd.suspended = false;
200} 204}
201 205
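
With the changes above, sched_clock.c programs an hrtimer to fire before the counter wraps, keeping a 12.5% safety margin (wrap - (wrap >> 3)). A rough stand-alone estimate with made-up counter parameters (56 bits at 24 MHz); the real code derives the wrap time from mult/shift via clocks_calc_max_nsecs():

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* hypothetical 56-bit counter running at 24 MHz */
        uint64_t mask = (1ULL << 56) - 1;
        uint64_t rate_hz = 24000000;

        /* coarse wrap estimate; the kernel computes it from mult/shift instead */
        uint64_t wrap_sec  = mask / rate_hz;
        uint64_t rearm_sec = wrap_sec - (wrap_sec >> 3);   /* keep 12.5% in reserve */

        printf("counter wraps after ~%llu s, hrtimer re-arms every ~%llu s\n",
               (unsigned long long)wrap_sec, (unsigned long long)rearm_sec);
        return 0;
    }
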
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 218bcb565fed..9532690daaa9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
70 struct clock_event_device *newdev) 70 struct clock_event_device *newdev)
71{ 71{
72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || 72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
73 (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
73 (newdev->features & CLOCK_EVT_FEAT_C3STOP)) 74 (newdev->features & CLOCK_EVT_FEAT_C3STOP))
74 return false; 75 return false;
75 76
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bc906cad709b..18e71f7fbc2a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);
31 31
32extern void clockevents_shutdown(struct clock_event_device *dev); 32extern void clockevents_shutdown(struct clock_event_device *dev);
33 33
34extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); 34extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
35 35
36/* 36/*
37 * NO_HZ / high resolution timer shared code 37 * NO_HZ / high resolution timer shared code
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 48b9fffabdc2..3abf53418b67 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1613 * ktime_get_update_offsets - hrtimer helper 1613 * ktime_get_update_offsets - hrtimer helper
1614 * @offs_real: pointer to storage for monotonic -> realtime offset 1614 * @offs_real: pointer to storage for monotonic -> realtime offset
1615 * @offs_boot: pointer to storage for monotonic -> boottime offset 1615 * @offs_boot: pointer to storage for monotonic -> boottime offset
1616 * @offs_tai: pointer to storage for monotonic -> clock tai offset
1616 * 1617 *
1617 * Returns current monotonic time and updates the offsets 1618 * Returns current monotonic time and updates the offsets
1618 * Called from hrtimer_interupt() or retrigger_next_event() 1619 * Called from hrtimer_interrupt() or retrigger_next_event()
1619 */ 1620 */
1620ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, 1621ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
1621 ktime_t *offs_tai) 1622 ktime_t *offs_tai)
@@ -1703,6 +1704,8 @@ int do_adjtimex(struct timex *txc)
1703 write_seqcount_end(&timekeeper_seq); 1704 write_seqcount_end(&timekeeper_seq);
1704 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1705 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1705 1706
1707 ntp_notify_cmos_timer();
1708
1706 return ret; 1709 return ret;
1707} 1710}
1708 1711
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f27b559..1fb08f21302e 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
298 period = ktime_to_timespec(time); 298 period = ktime_to_timespec(time);
299 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
300 300
301 seq_puts(m, "Timer Stats Version: v0.2\n"); 301 seq_puts(m, "Timer Stats Version: v0.3\n");
302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
303 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
304 seq_printf(m, "Overflow: %d entries\n", 304 seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
305 atomic_read(&overflow_count)); 305 seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
306 306
307 for (i = 0; i < nr_entries; i++) { 307 for (i = 0; i < nr_entries; i++) {
308 entry = entries + i; 308 entry = entries + i;
309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { 309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
310 seq_printf(m, "%4luD, %5d %-16s ", 310 seq_printf(m, "%4luD, %5d %-16s ",
311 entry->count, entry->pid, entry->comm); 311 entry->count, entry->pid, entry->comm);
312 } else { 312 } else {
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13db3d1..6582b82fa966 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1093 unsigned long data) 1093 unsigned long data)
1094{ 1094{
1095 int preempt_count = preempt_count(); 1095 int count = preempt_count();
1096 1096
1097#ifdef CONFIG_LOCKDEP 1097#ifdef CONFIG_LOCKDEP
1098 /* 1098 /*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1119 1119
1120 lock_map_release(&lockdep_map); 1120 lock_map_release(&lockdep_map);
1121 1121
1122 if (preempt_count != preempt_count()) { 1122 if (count != preempt_count()) {
1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1124 fn, preempt_count, preempt_count()); 1124 fn, count, preempt_count());
1125 /* 1125 /*
1126 * Restore the preempt count. That gives us a decent 1126 * Restore the preempt count. That gives us a decent
1127 * chance to survive and extract information. If the 1127 * chance to survive and extract information. If the
1128 * callback kept a lock held, bad luck, but not worse 1128 * callback kept a lock held, bad luck, but not worse
1129 * than the BUG() we had. 1129 * than the BUG() we had.
1130 */ 1130 */
1131 preempt_count() = preempt_count; 1131 preempt_count_set(count);
1132 } 1132 }
1133} 1133}
1134 1134
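
In kernel/timer.c above, the callback-imbalance check now restores the sampled value with preempt_count_set() instead of assigning through the old preempt_count() lvalue. A user-space sketch of the same detect-and-repair pattern; fake_preempt_count, leaky_callback() and call_timer_fn_like() are illustrative stand-ins, not kernel code:

    #include <stdio.h>

    static int fake_preempt_count;   /* stand-in for the kernel's per-CPU preempt count */

    static void leaky_callback(void)
    {
        fake_preempt_count++;        /* e.g. a spinlock taken but never released */
    }

    static void call_timer_fn_like(void (*fn)(void))
    {
        int count = fake_preempt_count;

        fn();

        if (count != fake_preempt_count) {
            fprintf(stderr, "timer callback leaked preempt count: %d -> %d\n",
                    count, fake_preempt_count);
            fake_preempt_count = count;   /* restore, as preempt_count_set() does */
        }
    }

    int main(void)
    {
        call_timer_fn_like(leaky_callback);
        return 0;
    }
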
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b8b8560bfb95..f785aef65799 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -26,6 +26,7 @@
26#include <linux/export.h> 26#include <linux/export.h>
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/list.h>
29 30
30#include <trace/events/block.h> 31#include <trace/events/block.h>
31 32
@@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1;
38static struct trace_array *blk_tr; 39static struct trace_array *blk_tr;
39static bool blk_tracer_enabled __read_mostly; 40static bool blk_tracer_enabled __read_mostly;
40 41
42static LIST_HEAD(running_trace_list);
43static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
44
41/* Select an alternative, minimalistic output than the original one */ 45/* Select an alternative, minimalistic output than the original one */
42#define TRACE_BLK_OPT_CLASSIC 0x1 46#define TRACE_BLK_OPT_CLASSIC 0x1
43 47
@@ -107,10 +111,18 @@ record_it:
107 * Send out a notify for this process, if we haven't done so since a trace 111 * Send out a notify for this process, if we haven't done so since a trace
108 * started 112 * started
109 */ 113 */
110static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) 114static void trace_note_tsk(struct task_struct *tsk)
111{ 115{
116 unsigned long flags;
117 struct blk_trace *bt;
118
112 tsk->btrace_seq = blktrace_seq; 119 tsk->btrace_seq = blktrace_seq;
113 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); 120 spin_lock_irqsave(&running_trace_lock, flags);
121 list_for_each_entry(bt, &running_trace_list, running_list) {
122 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
123 sizeof(tsk->comm));
124 }
125 spin_unlock_irqrestore(&running_trace_lock, flags);
114} 126}
115 127
116static void trace_note_time(struct blk_trace *bt) 128static void trace_note_time(struct blk_trace *bt)
@@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
229 goto record_it; 241 goto record_it;
230 } 242 }
231 243
244 if (unlikely(tsk->btrace_seq != blktrace_seq))
245 trace_note_tsk(tsk);
246
232 /* 247 /*
233 * A word about the locking here - we disable interrupts to reserve 248 * A word about the locking here - we disable interrupts to reserve
234 * some space in the relay per-cpu buffer, to prevent an irq 249 * some space in the relay per-cpu buffer, to prevent an irq
235 * from coming in and stepping on our toes. 250 * from coming in and stepping on our toes.
236 */ 251 */
237 local_irq_save(flags); 252 local_irq_save(flags);
238
239 if (unlikely(tsk->btrace_seq != blktrace_seq))
240 trace_note_tsk(bt, tsk);
241
242 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); 253 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
243 if (t) { 254 if (t) {
244 sequence = per_cpu_ptr(bt->sequence, cpu); 255 sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
477 bt->dir = dir; 488 bt->dir = dir;
478 bt->dev = dev; 489 bt->dev = dev;
479 atomic_set(&bt->dropped, 0); 490 atomic_set(&bt->dropped, 0);
491 INIT_LIST_HEAD(&bt->running_list);
480 492
481 ret = -EIO; 493 ret = -EIO;
482 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, 494 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
@@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
567 .end_lba = cbuts.end_lba, 579 .end_lba = cbuts.end_lba,
568 .pid = cbuts.pid, 580 .pid = cbuts.pid,
569 }; 581 };
570 memcpy(&buts.name, &cbuts.name, 32);
571 582
572 ret = do_blk_trace_setup(q, name, dev, bdev, &buts); 583 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
573 if (ret) 584 if (ret)
574 return ret; 585 return ret;
575 586
576 if (copy_to_user(arg, &buts.name, 32)) { 587 if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
577 blk_trace_remove(q); 588 blk_trace_remove(q);
578 return -EFAULT; 589 return -EFAULT;
579 } 590 }
@@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
601 blktrace_seq++; 612 blktrace_seq++;
602 smp_mb(); 613 smp_mb();
603 bt->trace_state = Blktrace_running; 614 bt->trace_state = Blktrace_running;
615 spin_lock_irq(&running_trace_lock);
616 list_add(&bt->running_list, &running_trace_list);
617 spin_unlock_irq(&running_trace_lock);
604 618
605 trace_note_time(bt); 619 trace_note_time(bt);
606 ret = 0; 620 ret = 0;
@@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
608 } else { 622 } else {
609 if (bt->trace_state == Blktrace_running) { 623 if (bt->trace_state == Blktrace_running) {
610 bt->trace_state = Blktrace_stopped; 624 bt->trace_state = Blktrace_stopped;
625 spin_lock_irq(&running_trace_lock);
626 list_del_init(&bt->running_list);
627 spin_unlock_irq(&running_trace_lock);
611 relay_flush(bt->rchan); 628 relay_flush(bt->rchan);
612 ret = 0; 629 ret = 0;
613 } 630 }
@@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q)
1472 if (atomic_dec_and_test(&blk_probes_ref)) 1489 if (atomic_dec_and_test(&blk_probes_ref))
1473 blk_unregister_tracepoints(); 1490 blk_unregister_tracepoints();
1474 1491
1492 spin_lock_irq(&running_trace_lock);
1493 list_del(&bt->running_list);
1494 spin_unlock_irq(&running_trace_lock);
1475 blk_trace_free(bt); 1495 blk_trace_free(bt);
1476 return 0; 1496 return 0;
1477} 1497}
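
blktrace now keeps every running trace on a global running_trace_list so that a task-name note can be delivered to all of them, not only to the trace that happened to log the current event. A minimal user-space sketch of that add-under-lock / walk-under-lock pattern (compile with -pthread); struct bt, trace_start() and note_task() are stand-ins, not kernel API:

    #include <pthread.h>
    #include <stdio.h>

    struct bt {                          /* stand-in for struct blk_trace   */
        const char *name;
        struct bt  *next;
    };

    static struct bt *running_list;      /* stand-in for running_trace_list */
    static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;

    static void trace_start(struct bt *bt)
    {
        pthread_mutex_lock(&running_lock);
        bt->next = running_list;
        running_list = bt;
        pthread_mutex_unlock(&running_lock);
    }

    static void note_task(const char *comm)
    {
        pthread_mutex_lock(&running_lock);
        for (struct bt *bt = running_list; bt; bt = bt->next)
            printf("announce task %s to the trace on %s\n", comm, bt->name);
        pthread_mutex_unlock(&running_lock);
    }

    int main(void)
    {
        struct bt a = { "sda", NULL };
        struct bt b = { "sdb", NULL };

        trace_start(&a);
        trace_start(&b);
        note_task("fio");
        return 0;
    }
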
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a6d098c6df3f..22fa55696760 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1978,12 +1978,27 @@ int __weak ftrace_arch_code_modify_post_process(void)
1978 1978
1979void ftrace_modify_all_code(int command) 1979void ftrace_modify_all_code(int command)
1980{ 1980{
1981 int update = command & FTRACE_UPDATE_TRACE_FUNC;
1982
1983 /*
1984 * If the ftrace_caller calls a ftrace_ops func directly,
1985 * we need to make sure that it only traces functions it
1986 * expects to trace. When doing the switch of functions,
1987 * we need to update to the ftrace_ops_list_func first
1988 * before the transition between old and new calls are set,
1989 * as the ftrace_ops_list_func will check the ops hashes
1990 * to make sure the ops are having the right functions
1991 * traced.
1992 */
1993 if (update)
1994 ftrace_update_ftrace_func(ftrace_ops_list_func);
1995
1981 if (command & FTRACE_UPDATE_CALLS) 1996 if (command & FTRACE_UPDATE_CALLS)
1982 ftrace_replace_code(1); 1997 ftrace_replace_code(1);
1983 else if (command & FTRACE_DISABLE_CALLS) 1998 else if (command & FTRACE_DISABLE_CALLS)
1984 ftrace_replace_code(0); 1999 ftrace_replace_code(0);
1985 2000
1986 if (command & FTRACE_UPDATE_TRACE_FUNC) 2001 if (update && ftrace_trace_function != ftrace_ops_list_func)
1987 ftrace_update_ftrace_func(ftrace_trace_function); 2002 ftrace_update_ftrace_func(ftrace_trace_function);
1988 2003
1989 if (command & FTRACE_START_FUNC_RET) 2004 if (command & FTRACE_START_FUNC_RET)
@@ -3292,7 +3307,11 @@ void unregister_ftrace_function_probe_all(char *glob)
3292static LIST_HEAD(ftrace_commands); 3307static LIST_HEAD(ftrace_commands);
3293static DEFINE_MUTEX(ftrace_cmd_mutex); 3308static DEFINE_MUTEX(ftrace_cmd_mutex);
3294 3309
3295int register_ftrace_command(struct ftrace_func_command *cmd) 3310/*
3311 * Currently we only register ftrace commands from __init, so mark this
3312 * __init too.
3313 */
3314__init int register_ftrace_command(struct ftrace_func_command *cmd)
3296{ 3315{
3297 struct ftrace_func_command *p; 3316 struct ftrace_func_command *p;
3298 int ret = 0; 3317 int ret = 0;
@@ -3311,7 +3330,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd)
3311 return ret; 3330 return ret;
3312} 3331}
3313 3332
3314int unregister_ftrace_command(struct ftrace_func_command *cmd) 3333/*
3334 * Currently we only unregister ftrace commands from __init, so mark
3335 * this __init too.
3336 */
3337__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
3315{ 3338{
3316 struct ftrace_func_command *p, *n; 3339 struct ftrace_func_command *p, *n;
3317 int ret = -ENODEV; 3340 int ret = -ENODEV;
@@ -3626,7 +3649,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
3626 3649
3627#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3650#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3628static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 3651static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3629static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 3652static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3630 3653
3631static int __init set_graph_function(char *str) 3654static int __init set_graph_function(char *str)
3632{ 3655{
@@ -3644,7 +3667,7 @@ static void __init set_ftrace_early_graph(char *buf)
3644 func = strsep(&buf, ","); 3667 func = strsep(&buf, ",");
3645 /* we allow only one expression at a time */ 3668 /* we allow only one expression at a time */
3646 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 3669 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
3647 func); 3670 FTRACE_GRAPH_MAX_FUNCS, func);
3648 if (ret) 3671 if (ret)
3649 printk(KERN_DEBUG "ftrace: function %s not " 3672 printk(KERN_DEBUG "ftrace: function %s not "
3650 "traceable\n", func); 3673 "traceable\n", func);
@@ -3761,15 +3784,25 @@ static const struct file_operations ftrace_notrace_fops = {
3761static DEFINE_MUTEX(graph_lock); 3784static DEFINE_MUTEX(graph_lock);
3762 3785
3763int ftrace_graph_count; 3786int ftrace_graph_count;
3764int ftrace_graph_filter_enabled; 3787int ftrace_graph_notrace_count;
3765unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 3788unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
3789unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
3790
3791struct ftrace_graph_data {
3792 unsigned long *table;
3793 size_t size;
3794 int *count;
3795 const struct seq_operations *seq_ops;
3796};
3766 3797
3767static void * 3798static void *
3768__g_next(struct seq_file *m, loff_t *pos) 3799__g_next(struct seq_file *m, loff_t *pos)
3769{ 3800{
3770 if (*pos >= ftrace_graph_count) 3801 struct ftrace_graph_data *fgd = m->private;
3802
3803 if (*pos >= *fgd->count)
3771 return NULL; 3804 return NULL;
3772 return &ftrace_graph_funcs[*pos]; 3805 return &fgd->table[*pos];
3773} 3806}
3774 3807
3775static void * 3808static void *
@@ -3781,10 +3814,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos)
3781 3814
3782static void *g_start(struct seq_file *m, loff_t *pos) 3815static void *g_start(struct seq_file *m, loff_t *pos)
3783{ 3816{
3817 struct ftrace_graph_data *fgd = m->private;
3818
3784 mutex_lock(&graph_lock); 3819 mutex_lock(&graph_lock);
3785 3820
3786 /* Nothing, tell g_show to print all functions are enabled */ 3821 /* Nothing, tell g_show to print all functions are enabled */
3787 if (!ftrace_graph_filter_enabled && !*pos) 3822 if (!*fgd->count && !*pos)
3788 return (void *)1; 3823 return (void *)1;
3789 3824
3790 return __g_next(m, pos); 3825 return __g_next(m, pos);
@@ -3820,38 +3855,88 @@ static const struct seq_operations ftrace_graph_seq_ops = {
3820}; 3855};
3821 3856
3822static int 3857static int
3823ftrace_graph_open(struct inode *inode, struct file *file) 3858__ftrace_graph_open(struct inode *inode, struct file *file,
3859 struct ftrace_graph_data *fgd)
3824{ 3860{
3825 int ret = 0; 3861 int ret = 0;
3826 3862
3827 if (unlikely(ftrace_disabled))
3828 return -ENODEV;
3829
3830 mutex_lock(&graph_lock); 3863 mutex_lock(&graph_lock);
3831 if ((file->f_mode & FMODE_WRITE) && 3864 if ((file->f_mode & FMODE_WRITE) &&
3832 (file->f_flags & O_TRUNC)) { 3865 (file->f_flags & O_TRUNC)) {
3833 ftrace_graph_filter_enabled = 0; 3866 *fgd->count = 0;
3834 ftrace_graph_count = 0; 3867 memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
3835 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
3836 } 3868 }
3837 mutex_unlock(&graph_lock); 3869 mutex_unlock(&graph_lock);
3838 3870
3839 if (file->f_mode & FMODE_READ) 3871 if (file->f_mode & FMODE_READ) {
3840 ret = seq_open(file, &ftrace_graph_seq_ops); 3872 ret = seq_open(file, fgd->seq_ops);
3873 if (!ret) {
3874 struct seq_file *m = file->private_data;
3875 m->private = fgd;
3876 }
3877 } else
3878 file->private_data = fgd;
3841 3879
3842 return ret; 3880 return ret;
3843} 3881}
3844 3882
3845static int 3883static int
3884ftrace_graph_open(struct inode *inode, struct file *file)
3885{
3886 struct ftrace_graph_data *fgd;
3887
3888 if (unlikely(ftrace_disabled))
3889 return -ENODEV;
3890
3891 fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
3892 if (fgd == NULL)
3893 return -ENOMEM;
3894
3895 fgd->table = ftrace_graph_funcs;
3896 fgd->size = FTRACE_GRAPH_MAX_FUNCS;
3897 fgd->count = &ftrace_graph_count;
3898 fgd->seq_ops = &ftrace_graph_seq_ops;
3899
3900 return __ftrace_graph_open(inode, file, fgd);
3901}
3902
3903static int
3904ftrace_graph_notrace_open(struct inode *inode, struct file *file)
3905{
3906 struct ftrace_graph_data *fgd;
3907
3908 if (unlikely(ftrace_disabled))
3909 return -ENODEV;
3910
3911 fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
3912 if (fgd == NULL)
3913 return -ENOMEM;
3914
3915 fgd->table = ftrace_graph_notrace_funcs;
3916 fgd->size = FTRACE_GRAPH_MAX_FUNCS;
3917 fgd->count = &ftrace_graph_notrace_count;
3918 fgd->seq_ops = &ftrace_graph_seq_ops;
3919
3920 return __ftrace_graph_open(inode, file, fgd);
3921}
3922
3923static int
3846ftrace_graph_release(struct inode *inode, struct file *file) 3924ftrace_graph_release(struct inode *inode, struct file *file)
3847{ 3925{
3848 if (file->f_mode & FMODE_READ) 3926 if (file->f_mode & FMODE_READ) {
3927 struct seq_file *m = file->private_data;
3928
3929 kfree(m->private);
3849 seq_release(inode, file); 3930 seq_release(inode, file);
3931 } else {
3932 kfree(file->private_data);
3933 }
3934
3850 return 0; 3935 return 0;
3851} 3936}
3852 3937
3853static int 3938static int
3854ftrace_set_func(unsigned long *array, int *idx, char *buffer) 3939ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
3855{ 3940{
3856 struct dyn_ftrace *rec; 3941 struct dyn_ftrace *rec;
3857 struct ftrace_page *pg; 3942 struct ftrace_page *pg;
@@ -3864,7 +3949,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3864 3949
3865 /* decode regex */ 3950 /* decode regex */
3866 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3951 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
3867 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3952 if (!not && *idx >= size)
3868 return -EBUSY; 3953 return -EBUSY;
3869 3954
3870 search_len = strlen(search); 3955 search_len = strlen(search);
@@ -3892,7 +3977,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3892 fail = 0; 3977 fail = 0;
3893 if (!exists) { 3978 if (!exists) {
3894 array[(*idx)++] = rec->ip; 3979 array[(*idx)++] = rec->ip;
3895 if (*idx >= FTRACE_GRAPH_MAX_FUNCS) 3980 if (*idx >= size)
3896 goto out; 3981 goto out;
3897 } 3982 }
3898 } else { 3983 } else {
@@ -3910,8 +3995,6 @@ out:
3910 if (fail) 3995 if (fail)
3911 return -EINVAL; 3996 return -EINVAL;
3912 3997
3913 ftrace_graph_filter_enabled = !!(*idx);
3914
3915 return 0; 3998 return 0;
3916} 3999}
3917 4000
@@ -3920,36 +4003,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
3920 size_t cnt, loff_t *ppos) 4003 size_t cnt, loff_t *ppos)
3921{ 4004{
3922 struct trace_parser parser; 4005 struct trace_parser parser;
3923 ssize_t read, ret; 4006 ssize_t read, ret = 0;
4007 struct ftrace_graph_data *fgd = file->private_data;
3924 4008
3925 if (!cnt) 4009 if (!cnt)
3926 return 0; 4010 return 0;
3927 4011
3928 mutex_lock(&graph_lock); 4012 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
3929 4013 return -ENOMEM;
3930 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
3931 ret = -ENOMEM;
3932 goto out_unlock;
3933 }
3934 4014
3935 read = trace_get_user(&parser, ubuf, cnt, ppos); 4015 read = trace_get_user(&parser, ubuf, cnt, ppos);
3936 4016
3937 if (read >= 0 && trace_parser_loaded((&parser))) { 4017 if (read >= 0 && trace_parser_loaded((&parser))) {
3938 parser.buffer[parser.idx] = 0; 4018 parser.buffer[parser.idx] = 0;
3939 4019
4020 mutex_lock(&graph_lock);
4021
3940 /* we allow only one expression at a time */ 4022 /* we allow only one expression at a time */
3941 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 4023 ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
3942 parser.buffer); 4024 parser.buffer);
3943 if (ret) 4025
3944 goto out_free; 4026 mutex_unlock(&graph_lock);
3945 } 4027 }
3946 4028
3947 ret = read; 4029 if (!ret)
4030 ret = read;
3948 4031
3949out_free:
3950 trace_parser_put(&parser); 4032 trace_parser_put(&parser);
3951out_unlock:
3952 mutex_unlock(&graph_lock);
3953 4033
3954 return ret; 4034 return ret;
3955} 4035}
@@ -3961,6 +4041,14 @@ static const struct file_operations ftrace_graph_fops = {
3961 .llseek = ftrace_filter_lseek, 4041 .llseek = ftrace_filter_lseek,
3962 .release = ftrace_graph_release, 4042 .release = ftrace_graph_release,
3963}; 4043};
4044
4045static const struct file_operations ftrace_graph_notrace_fops = {
4046 .open = ftrace_graph_notrace_open,
4047 .read = seq_read,
4048 .write = ftrace_graph_write,
4049 .llseek = ftrace_filter_lseek,
4050 .release = ftrace_graph_release,
4051};
3964#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4052#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3965 4053
3966static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 4054static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
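
The set_graph_function/set_graph_notrace rework above parameterises one open/write/release implementation with a small descriptor (table, size, count, seq_ops) carried in the file's private data. A user-space sketch of that idea, using hypothetical names graph_open(), filter_funcs and notrace_funcs:

    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_FUNCS 32

    struct graph_data {                      /* mirrors struct ftrace_graph_data */
        unsigned long *table;
        size_t         size;
        int           *count;
        const char    *name;                 /* stands in for the seq_ops pointer */
    };

    static unsigned long filter_funcs[MAX_FUNCS], notrace_funcs[MAX_FUNCS];
    static int filter_count, notrace_count;

    static struct graph_data *graph_open(unsigned long *table, size_t size,
                                         int *count, const char *name)
    {
        struct graph_data *fgd = malloc(sizeof(*fgd));

        if (!fgd)
            return NULL;
        fgd->table = table;
        fgd->size  = size;
        fgd->count = count;
        fgd->name  = name;
        return fgd;                          /* freed on release, as in the diff */
    }

    int main(void)
    {
        struct graph_data *filter = graph_open(filter_funcs, MAX_FUNCS,
                                               &filter_count, "set_graph_function");
        struct graph_data *notrace = graph_open(notrace_funcs, MAX_FUNCS,
                                                &notrace_count, "set_graph_notrace");

        if (filter && notrace)
            printf("%s and %s share one open/write/release path\n",
                   filter->name, notrace->name);
        free(filter);
        free(notrace);
        return 0;
    }
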
@@ -3982,6 +4070,9 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3982 trace_create_file("set_graph_function", 0444, d_tracer, 4070 trace_create_file("set_graph_function", 0444, d_tracer,
3983 NULL, 4071 NULL,
3984 &ftrace_graph_fops); 4072 &ftrace_graph_fops);
4073 trace_create_file("set_graph_notrace", 0444, d_tracer,
4074 NULL,
4075 &ftrace_graph_notrace_fops);
3985#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4076#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3986 4077
3987 return 0; 4078 return 0;
@@ -4305,12 +4396,21 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4305 */ 4396 */
4306 preempt_disable_notrace(); 4397 preempt_disable_notrace();
4307 trace_recursion_set(TRACE_CONTROL_BIT); 4398 trace_recursion_set(TRACE_CONTROL_BIT);
4399
4400 /*
4401 * Control funcs (perf) uses RCU. Only trace if
4402 * RCU is currently active.
4403 */
4404 if (!rcu_is_watching())
4405 goto out;
4406
4308 do_for_each_ftrace_op(op, ftrace_control_list) { 4407 do_for_each_ftrace_op(op, ftrace_control_list) {
4309 if (!(op->flags & FTRACE_OPS_FL_STUB) && 4408 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4310 !ftrace_function_local_disabled(op) && 4409 !ftrace_function_local_disabled(op) &&
4311 ftrace_ops_test(op, ip, regs)) 4410 ftrace_ops_test(op, ip, regs))
4312 op->func(ip, parent_ip, op, regs); 4411 op->func(ip, parent_ip, op, regs);
4313 } while_for_each_ftrace_op(op); 4412 } while_for_each_ftrace_op(op);
4413 out:
4314 trace_recursion_clear(TRACE_CONTROL_BIT); 4414 trace_recursion_clear(TRACE_CONTROL_BIT);
4315 preempt_enable_notrace(); 4415 preempt_enable_notrace();
4316} 4416}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 496f94d57698..9d20cd9743ef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -235,13 +235,33 @@ void trace_array_put(struct trace_array *this_tr)
235 mutex_unlock(&trace_types_lock); 235 mutex_unlock(&trace_types_lock);
236} 236}
237 237
238int filter_current_check_discard(struct ring_buffer *buffer, 238int filter_check_discard(struct ftrace_event_file *file, void *rec,
239 struct ftrace_event_call *call, void *rec, 239 struct ring_buffer *buffer,
240 struct ring_buffer_event *event) 240 struct ring_buffer_event *event)
241{ 241{
242 return filter_check_discard(call, rec, buffer, event); 242 if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
243 !filter_match_preds(file->filter, rec)) {
244 ring_buffer_discard_commit(buffer, event);
245 return 1;
246 }
247
248 return 0;
243} 249}
244EXPORT_SYMBOL_GPL(filter_current_check_discard); 250EXPORT_SYMBOL_GPL(filter_check_discard);
251
252int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
253 struct ring_buffer *buffer,
254 struct ring_buffer_event *event)
255{
256 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
257 !filter_match_preds(call->filter, rec)) {
258 ring_buffer_discard_commit(buffer, event);
259 return 1;
260 }
261
262 return 0;
263}
264EXPORT_SYMBOL_GPL(call_filter_check_discard);
245 265
246cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) 266cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
247{ 267{
@@ -843,9 +863,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
843 if (isspace(ch)) { 863 if (isspace(ch)) {
844 parser->buffer[parser->idx] = 0; 864 parser->buffer[parser->idx] = 0;
845 parser->cont = false; 865 parser->cont = false;
846 } else { 866 } else if (parser->idx < parser->size - 1) {
847 parser->cont = true; 867 parser->cont = true;
848 parser->buffer[parser->idx++] = ch; 868 parser->buffer[parser->idx++] = ch;
869 } else {
870 ret = -EINVAL;
871 goto out;
849 } 872 }
850 873
851 *ppos += read; 874 *ppos += read;
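
trace_get_user() now refuses to append once the token would no longer fit (parser->idx must stay below parser->size - 1), where it previously wrote past the buffer. A small stand-alone sketch of the bounded append; struct parser here is an 8-byte toy, not the kernel's trace_parser:

    #include <stdio.h>

    struct parser {                      /* toy stand-in for struct trace_parser */
        char   buffer[8];
        size_t idx;
        size_t size;
    };

    /* Returns 0 if the character fits (leaving room for the NUL), -1 otherwise. */
    static int parser_append(struct parser *p, char ch)
    {
        if (p->idx < p->size - 1) {
            p->buffer[p->idx++] = ch;
            return 0;
        }
        return -1;                       /* old code would have written past the end */
    }

    int main(void)
    {
        struct parser p = { .idx = 0, .size = sizeof(p.buffer) };
        const char *input = "function_name_that_is_too_long";

        for (size_t i = 0; input[i]; i++) {
            if (parser_append(&p, input[i]) < 0) {
                fprintf(stderr, "token too long, rejected\n");
                break;
            }
        }
        p.buffer[p.idx] = '\0';
        printf("stored: \"%s\"\n", p.buffer);
        return 0;
    }
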
@@ -1261,21 +1284,6 @@ int is_tracing_stopped(void)
1261} 1284}
1262 1285
1263/** 1286/**
1264 * ftrace_off_permanent - disable all ftrace code permanently
1265 *
1266 * This should only be called when a serious anomally has
1267 * been detected. This will turn off the function tracing,
1268 * ring buffers, and other tracing utilites. It takes no
1269 * locks and can be called from any context.
1270 */
1271void ftrace_off_permanent(void)
1272{
1273 tracing_disabled = 1;
1274 ftrace_stop();
1275 tracing_off_permanent();
1276}
1277
1278/**
1279 * tracing_start - quick start of the tracer 1287 * tracing_start - quick start of the tracer
1280 * 1288 *
1281 * If tracing is enabled but was stopped by tracing_stop, 1289 * If tracing is enabled but was stopped by tracing_stop,
@@ -1509,7 +1517,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1509#endif 1517#endif
1510 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | 1518 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
1511 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 1519 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
1512 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 1520 (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
1521 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
1513} 1522}
1514EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 1523EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
1515 1524
@@ -1630,7 +1639,7 @@ trace_function(struct trace_array *tr,
1630 entry->ip = ip; 1639 entry->ip = ip;
1631 entry->parent_ip = parent_ip; 1640 entry->parent_ip = parent_ip;
1632 1641
1633 if (!filter_check_discard(call, entry, buffer, event)) 1642 if (!call_filter_check_discard(call, entry, buffer, event))
1634 __buffer_unlock_commit(buffer, event); 1643 __buffer_unlock_commit(buffer, event);
1635} 1644}
1636 1645
@@ -1714,7 +1723,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1714 1723
1715 entry->size = trace.nr_entries; 1724 entry->size = trace.nr_entries;
1716 1725
1717 if (!filter_check_discard(call, entry, buffer, event)) 1726 if (!call_filter_check_discard(call, entry, buffer, event))
1718 __buffer_unlock_commit(buffer, event); 1727 __buffer_unlock_commit(buffer, event);
1719 1728
1720 out: 1729 out:
@@ -1816,7 +1825,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1816 trace.entries = entry->caller; 1825 trace.entries = entry->caller;
1817 1826
1818 save_stack_trace_user(&trace); 1827 save_stack_trace_user(&trace);
1819 if (!filter_check_discard(call, entry, buffer, event)) 1828 if (!call_filter_check_discard(call, entry, buffer, event))
1820 __buffer_unlock_commit(buffer, event); 1829 __buffer_unlock_commit(buffer, event);
1821 1830
1822 out_drop_count: 1831 out_drop_count:
@@ -2008,7 +2017,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
2008 entry->fmt = fmt; 2017 entry->fmt = fmt;
2009 2018
2010 memcpy(entry->buf, tbuffer, sizeof(u32) * len); 2019 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
2011 if (!filter_check_discard(call, entry, buffer, event)) { 2020 if (!call_filter_check_discard(call, entry, buffer, event)) {
2012 __buffer_unlock_commit(buffer, event); 2021 __buffer_unlock_commit(buffer, event);
2013 ftrace_trace_stack(buffer, flags, 6, pc); 2022 ftrace_trace_stack(buffer, flags, 6, pc);
2014 } 2023 }
@@ -2063,7 +2072,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2063 2072
2064 memcpy(&entry->buf, tbuffer, len); 2073 memcpy(&entry->buf, tbuffer, len);
2065 entry->buf[len] = '\0'; 2074 entry->buf[len] = '\0';
2066 if (!filter_check_discard(call, entry, buffer, event)) { 2075 if (!call_filter_check_discard(call, entry, buffer, event)) {
2067 __buffer_unlock_commit(buffer, event); 2076 __buffer_unlock_commit(buffer, event);
2068 ftrace_trace_stack(buffer, flags, 6, pc); 2077 ftrace_trace_stack(buffer, flags, 6, pc);
2069 } 2078 }
@@ -2760,7 +2769,7 @@ static void show_snapshot_main_help(struct seq_file *m)
2760 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2769 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2761 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2770 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2762 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2771 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
2763 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); 2772 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");
2764 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2773 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
2765 seq_printf(m, "# is not a '0' or '1')\n"); 2774 seq_printf(m, "# is not a '0' or '1')\n");
2766} 2775}
@@ -2964,6 +2973,11 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2964 return 0; 2973 return 0;
2965} 2974}
2966 2975
2976bool tracing_is_disabled(void)
2977{
2978 return (tracing_disabled) ? true: false;
2979}
2980
2967/* 2981/*
2968 * Open and update trace_array ref count. 2982 * Open and update trace_array ref count.
2969 * Must have the current trace_array passed to it. 2983 * Must have the current trace_array passed to it.
@@ -3166,11 +3180,6 @@ static const struct file_operations show_traces_fops = {
3166}; 3180};
3167 3181
3168/* 3182/*
3169 * Only trace on a CPU if the bitmask is set:
3170 */
3171static cpumask_var_t tracing_cpumask;
3172
3173/*
3174 * The tracer itself will not take this lock, but still we want 3183 * The tracer itself will not take this lock, but still we want
3175 * to provide a consistent cpumask to user-space: 3184 * to provide a consistent cpumask to user-space:
3176 */ 3185 */
@@ -3186,11 +3195,12 @@ static ssize_t
3186tracing_cpumask_read(struct file *filp, char __user *ubuf, 3195tracing_cpumask_read(struct file *filp, char __user *ubuf,
3187 size_t count, loff_t *ppos) 3196 size_t count, loff_t *ppos)
3188{ 3197{
3198 struct trace_array *tr = file_inode(filp)->i_private;
3189 int len; 3199 int len;
3190 3200
3191 mutex_lock(&tracing_cpumask_update_lock); 3201 mutex_lock(&tracing_cpumask_update_lock);
3192 3202
3193 len = cpumask_scnprintf(mask_str, count, tracing_cpumask); 3203 len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask);
3194 if (count - len < 2) { 3204 if (count - len < 2) {
3195 count = -EINVAL; 3205 count = -EINVAL;
3196 goto out_err; 3206 goto out_err;
@@ -3208,7 +3218,7 @@ static ssize_t
3208tracing_cpumask_write(struct file *filp, const char __user *ubuf, 3218tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3209 size_t count, loff_t *ppos) 3219 size_t count, loff_t *ppos)
3210{ 3220{
3211 struct trace_array *tr = filp->private_data; 3221 struct trace_array *tr = file_inode(filp)->i_private;
3212 cpumask_var_t tracing_cpumask_new; 3222 cpumask_var_t tracing_cpumask_new;
3213 int err, cpu; 3223 int err, cpu;
3214 3224
@@ -3228,12 +3238,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3228 * Increase/decrease the disabled counter if we are 3238 * Increase/decrease the disabled counter if we are
3229 * about to flip a bit in the cpumask: 3239 * about to flip a bit in the cpumask:
3230 */ 3240 */
3231 if (cpumask_test_cpu(cpu, tracing_cpumask) && 3241 if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
3232 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3242 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
3233 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); 3243 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
3234 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); 3244 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
3235 } 3245 }
3236 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 3246 if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
3237 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3247 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
3238 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); 3248 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
3239 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); 3249 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
@@ -3242,7 +3252,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3242 arch_spin_unlock(&ftrace_max_lock); 3252 arch_spin_unlock(&ftrace_max_lock);
3243 local_irq_enable(); 3253 local_irq_enable();
3244 3254
3245 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 3255 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
3246 3256
3247 mutex_unlock(&tracing_cpumask_update_lock); 3257 mutex_unlock(&tracing_cpumask_update_lock);
3248 free_cpumask_var(tracing_cpumask_new); 3258 free_cpumask_var(tracing_cpumask_new);
@@ -3256,9 +3266,10 @@ err_unlock:
3256} 3266}
3257 3267
3258static const struct file_operations tracing_cpumask_fops = { 3268static const struct file_operations tracing_cpumask_fops = {
3259 .open = tracing_open_generic, 3269 .open = tracing_open_generic_tr,
3260 .read = tracing_cpumask_read, 3270 .read = tracing_cpumask_read,
3261 .write = tracing_cpumask_write, 3271 .write = tracing_cpumask_write,
3272 .release = tracing_release_generic_tr,
3262 .llseek = generic_file_llseek, 3273 .llseek = generic_file_llseek,
3263}; 3274};
3264 3275
@@ -5457,12 +5468,12 @@ static struct ftrace_func_command ftrace_snapshot_cmd = {
5457 .func = ftrace_trace_snapshot_callback, 5468 .func = ftrace_trace_snapshot_callback,
5458}; 5469};
5459 5470
5460static int register_snapshot_cmd(void) 5471static __init int register_snapshot_cmd(void)
5461{ 5472{
5462 return register_ftrace_command(&ftrace_snapshot_cmd); 5473 return register_ftrace_command(&ftrace_snapshot_cmd);
5463} 5474}
5464#else 5475#else
5465static inline int register_snapshot_cmd(void) { return 0; } 5476static inline __init int register_snapshot_cmd(void) { return 0; }
5466#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ 5477#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
5467 5478
5468struct dentry *tracing_init_dentry_tr(struct trace_array *tr) 5479struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
@@ -5938,6 +5949,11 @@ static int new_instance_create(const char *name)
5938 if (!tr->name) 5949 if (!tr->name)
5939 goto out_free_tr; 5950 goto out_free_tr;
5940 5951
5952 if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
5953 goto out_free_tr;
5954
5955 cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
5956
5941 raw_spin_lock_init(&tr->start_lock); 5957 raw_spin_lock_init(&tr->start_lock);
5942 5958
5943 tr->current_trace = &nop_trace; 5959 tr->current_trace = &nop_trace;
@@ -5969,6 +5985,7 @@ static int new_instance_create(const char *name)
5969 out_free_tr: 5985 out_free_tr:
5970 if (tr->trace_buffer.buffer) 5986 if (tr->trace_buffer.buffer)
5971 ring_buffer_free(tr->trace_buffer.buffer); 5987 ring_buffer_free(tr->trace_buffer.buffer);
5988 free_cpumask_var(tr->tracing_cpumask);
5972 kfree(tr->name); 5989 kfree(tr->name);
5973 kfree(tr); 5990 kfree(tr);
5974 5991
@@ -6098,6 +6115,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6098{ 6115{
6099 int cpu; 6116 int cpu;
6100 6117
6118 trace_create_file("tracing_cpumask", 0644, d_tracer,
6119 tr, &tracing_cpumask_fops);
6120
6101 trace_create_file("trace_options", 0644, d_tracer, 6121 trace_create_file("trace_options", 0644, d_tracer,
6102 tr, &tracing_iter_fops); 6122 tr, &tracing_iter_fops);
6103 6123
@@ -6147,9 +6167,6 @@ static __init int tracer_init_debugfs(void)
6147 6167
6148 init_tracer_debugfs(&global_trace, d_tracer); 6168 init_tracer_debugfs(&global_trace, d_tracer);
6149 6169
6150 trace_create_file("tracing_cpumask", 0644, d_tracer,
6151 &global_trace, &tracing_cpumask_fops);
6152
6153 trace_create_file("available_tracers", 0444, d_tracer, 6170 trace_create_file("available_tracers", 0444, d_tracer,
6154 &global_trace, &show_traces_fops); 6171 &global_trace, &show_traces_fops);
6155 6172
@@ -6250,6 +6267,17 @@ void trace_init_global_iter(struct trace_iterator *iter)
6250 iter->trace = iter->tr->current_trace; 6267 iter->trace = iter->tr->current_trace;
6251 iter->cpu_file = RING_BUFFER_ALL_CPUS; 6268 iter->cpu_file = RING_BUFFER_ALL_CPUS;
6252 iter->trace_buffer = &global_trace.trace_buffer; 6269 iter->trace_buffer = &global_trace.trace_buffer;
6270
6271 if (iter->trace && iter->trace->open)
6272 iter->trace->open(iter);
6273
6274 /* Annotate start of buffers if we had overruns */
6275 if (ring_buffer_overruns(iter->trace_buffer->buffer))
6276 iter->iter_flags |= TRACE_FILE_ANNOTATE;
6277
6278 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
6279 if (trace_clocks[iter->tr->clock_id].in_ns)
6280 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
6253} 6281}
6254 6282
6255void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) 6283void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
@@ -6371,7 +6399,7 @@ __init static int tracer_alloc_buffers(void)
6371 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 6399 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
6372 goto out; 6400 goto out;
6373 6401
6374 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 6402 if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
6375 goto out_free_buffer_mask; 6403 goto out_free_buffer_mask;
6376 6404
6377 /* Only allocate trace_printk buffers if a trace_printk exists */ 6405 /* Only allocate trace_printk buffers if a trace_printk exists */
@@ -6386,7 +6414,7 @@ __init static int tracer_alloc_buffers(void)
6386 ring_buf_size = 1; 6414 ring_buf_size = 1;
6387 6415
6388 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 6416 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
6389 cpumask_copy(tracing_cpumask, cpu_all_mask); 6417 cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask);
6390 6418
6391 raw_spin_lock_init(&global_trace.start_lock); 6419 raw_spin_lock_init(&global_trace.start_lock);
6392 6420
@@ -6441,7 +6469,7 @@ out_free_cpumask:
6441#ifdef CONFIG_TRACER_MAX_TRACE 6469#ifdef CONFIG_TRACER_MAX_TRACE
6442 free_percpu(global_trace.max_buffer.data); 6470 free_percpu(global_trace.max_buffer.data);
6443#endif 6471#endif
6444 free_cpumask_var(tracing_cpumask); 6472 free_cpumask_var(global_trace.tracing_cpumask);
6445out_free_buffer_mask: 6473out_free_buffer_mask:
6446 free_cpumask_var(tracing_buffer_mask); 6474 free_cpumask_var(tracing_buffer_mask);
6447out: 6475out:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fe39acd4c1aa..ea189e027b80 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -124,6 +124,7 @@ enum trace_flag_type {
124 TRACE_FLAG_NEED_RESCHED = 0x04, 124 TRACE_FLAG_NEED_RESCHED = 0x04,
125 TRACE_FLAG_HARDIRQ = 0x08, 125 TRACE_FLAG_HARDIRQ = 0x08,
126 TRACE_FLAG_SOFTIRQ = 0x10, 126 TRACE_FLAG_SOFTIRQ = 0x10,
127 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
127}; 128};
128 129
129#define TRACE_BUF_SIZE 1024 130#define TRACE_BUF_SIZE 1024
@@ -192,8 +193,8 @@ struct trace_array {
192#ifdef CONFIG_FTRACE_SYSCALLS 193#ifdef CONFIG_FTRACE_SYSCALLS
193 int sys_refcount_enter; 194 int sys_refcount_enter;
194 int sys_refcount_exit; 195 int sys_refcount_exit;
195 DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 196 struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls];
196 DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 197 struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];
197#endif 198#endif
198 int stop_count; 199 int stop_count;
199 int clock_id; 200 int clock_id;
@@ -206,6 +207,7 @@ struct trace_array {
206 struct dentry *event_dir; 207 struct dentry *event_dir;
207 struct list_head systems; 208 struct list_head systems;
208 struct list_head events; 209 struct list_head events;
210 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
209 int ref; 211 int ref;
210}; 212};
211 213
@@ -513,6 +515,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf);
513void tracing_reset_current(int cpu); 515void tracing_reset_current(int cpu);
514void tracing_reset_all_online_cpus(void); 516void tracing_reset_all_online_cpus(void);
515int tracing_open_generic(struct inode *inode, struct file *filp); 517int tracing_open_generic(struct inode *inode, struct file *filp);
518bool tracing_is_disabled(void);
516struct dentry *trace_create_file(const char *name, 519struct dentry *trace_create_file(const char *name,
517 umode_t mode, 520 umode_t mode,
518 struct dentry *parent, 521 struct dentry *parent,
@@ -710,6 +713,8 @@ extern unsigned long trace_flags;
710#define TRACE_GRAPH_PRINT_PROC 0x8 713#define TRACE_GRAPH_PRINT_PROC 0x8
711#define TRACE_GRAPH_PRINT_DURATION 0x10 714#define TRACE_GRAPH_PRINT_DURATION 0x10
712#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 715#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
716#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
717#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
713 718
714extern enum print_line_t 719extern enum print_line_t
715print_graph_function_flags(struct trace_iterator *iter, u32 flags); 720print_graph_function_flags(struct trace_iterator *iter, u32 flags);
@@ -729,15 +734,16 @@ extern void __trace_graph_return(struct trace_array *tr,
729#ifdef CONFIG_DYNAMIC_FTRACE 734#ifdef CONFIG_DYNAMIC_FTRACE
730/* TODO: make this variable */ 735/* TODO: make this variable */
731#define FTRACE_GRAPH_MAX_FUNCS 32 736#define FTRACE_GRAPH_MAX_FUNCS 32
732extern int ftrace_graph_filter_enabled;
733extern int ftrace_graph_count; 737extern int ftrace_graph_count;
734extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 738extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
739extern int ftrace_graph_notrace_count;
740extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];
735 741
736static inline int ftrace_graph_addr(unsigned long addr) 742static inline int ftrace_graph_addr(unsigned long addr)
737{ 743{
738 int i; 744 int i;
739 745
740 if (!ftrace_graph_filter_enabled) 746 if (!ftrace_graph_count)
741 return 1; 747 return 1;
742 748
743 for (i = 0; i < ftrace_graph_count; i++) { 749 for (i = 0; i < ftrace_graph_count; i++) {
@@ -757,11 +763,31 @@ static inline int ftrace_graph_addr(unsigned long addr)
757 763
758 return 0; 764 return 0;
759} 765}
766
767static inline int ftrace_graph_notrace_addr(unsigned long addr)
768{
769 int i;
770
771 if (!ftrace_graph_notrace_count)
772 return 0;
773
774 for (i = 0; i < ftrace_graph_notrace_count; i++) {
775 if (addr == ftrace_graph_notrace_funcs[i])
776 return 1;
777 }
778
779 return 0;
780}
760#else 781#else
761static inline int ftrace_graph_addr(unsigned long addr) 782static inline int ftrace_graph_addr(unsigned long addr)
762{ 783{
763 return 1; 784 return 1;
764} 785}
786
787static inline int ftrace_graph_notrace_addr(unsigned long addr)
788{
789 return 0;
790}
765#endif /* CONFIG_DYNAMIC_FTRACE */ 791#endif /* CONFIG_DYNAMIC_FTRACE */
766#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 792#else /* CONFIG_FUNCTION_GRAPH_TRACER */
767static inline enum print_line_t 793static inline enum print_line_t
@@ -985,9 +1011,9 @@ struct filter_pred {
985 1011
986extern enum regex_type 1012extern enum regex_type
987filter_parse_regex(char *buff, int len, char **search, int *not); 1013filter_parse_regex(char *buff, int len, char **search, int *not);
988extern void print_event_filter(struct ftrace_event_call *call, 1014extern void print_event_filter(struct ftrace_event_file *file,
989 struct trace_seq *s); 1015 struct trace_seq *s);
990extern int apply_event_filter(struct ftrace_event_call *call, 1016extern int apply_event_filter(struct ftrace_event_file *file,
991 char *filter_string); 1017 char *filter_string);
992extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, 1018extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
993 char *filter_string); 1019 char *filter_string);
@@ -998,20 +1024,6 @@ extern int filter_assign_type(const char *type);
998struct ftrace_event_field * 1024struct ftrace_event_field *
999trace_find_event_field(struct ftrace_event_call *call, char *name); 1025trace_find_event_field(struct ftrace_event_call *call, char *name);
1000 1026
1001static inline int
1002filter_check_discard(struct ftrace_event_call *call, void *rec,
1003 struct ring_buffer *buffer,
1004 struct ring_buffer_event *event)
1005{
1006 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
1007 !filter_match_preds(call->filter, rec)) {
1008 ring_buffer_discard_commit(buffer, event);
1009 return 1;
1010 }
1011
1012 return 0;
1013}
1014
1015extern void trace_event_enable_cmd_record(bool enable); 1027extern void trace_event_enable_cmd_record(bool enable);
1016extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); 1028extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1017extern int event_trace_del_tracer(struct trace_array *tr); 1029extern int event_trace_del_tracer(struct trace_array *tr);
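
The new set_graph_notrace support in this header is just a second address list beside the existing filter list: ftrace_graph_addr() traces everything when its list is empty, while ftrace_graph_notrace_addr() suppresses nothing when its list is empty. A minimal userspace sketch of that lookup pattern follows; the array contents are invented, and the elided loop body of the filter helper is assumed to be a plain comparison (the real one does extra bookkeeping).

#include <stdio.h>

#define GRAPH_MAX_FUNCS 32

/* Userspace stand-ins for ftrace_graph_funcs / ftrace_graph_notrace_funcs. */
static unsigned long graph_funcs[GRAPH_MAX_FUNCS];
static int graph_count;
static unsigned long graph_notrace_funcs[GRAPH_MAX_FUNCS];
static int graph_notrace_count;

/* Mirrors ftrace_graph_addr(): an empty filter list means "trace everything". */
static int graph_addr_filtered(unsigned long addr)
{
	int i;

	if (!graph_count)
		return 1;
	for (i = 0; i < graph_count; i++)
		if (addr == graph_funcs[i])
			return 1;
	return 0;
}

/* Mirrors ftrace_graph_notrace_addr(): an empty list never suppresses anything. */
static int graph_addr_notraced(unsigned long addr)
{
	int i;

	if (!graph_notrace_count)
		return 0;
	for (i = 0; i < graph_notrace_count; i++)
		if (addr == graph_notrace_funcs[i])
			return 1;
	return 0;
}

int main(void)
{
	graph_funcs[graph_count++] = 0x1000;              /* hypothetical address */
	graph_notrace_funcs[graph_notrace_count++] = 0x2000;

	printf("0x1000 filter=%d notrace=%d\n",
	       graph_addr_filtered(0x1000), graph_addr_notraced(0x1000));
	printf("0x2000 filter=%d notrace=%d\n",
	       graph_addr_filtered(0x2000), graph_addr_notraced(0x2000));
	return 0;
}
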
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index d594da0dc03c..697fb9bac8f0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -78,7 +78,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
78 entry->line = f->line; 78 entry->line = f->line;
79 entry->correct = val == expect; 79 entry->correct = val == expect;
80 80
81 if (!filter_check_discard(call, entry, buffer, event)) 81 if (!call_filter_check_discard(call, entry, buffer, event))
82 __buffer_unlock_commit(buffer, event); 82 __buffer_unlock_commit(buffer, event);
83 83
84 out: 84 out:
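
trace_branch.c only switches to the renamed call_filter_check_discard(), whose logic is the inline filter_check_discard() removed from trace.h above: if the event is marked as filtered and the record fails the predicates, the staged ring-buffer entry is discarded instead of committed. A toy userspace sketch of that calling convention, with the ring buffer and predicate reduced to stand-ins:

#include <stdbool.h>
#include <stdio.h>

struct entry { int value; };

static bool filtered;                        /* analogous to TRACE_EVENT_FL_FILTERED */
static bool match_preds(const struct entry *e) { return e->value > 0; }

/* Returns true when the staged entry should be dropped, mirroring the
 * filter_check_discard()/call_filter_check_discard() convention. */
static bool check_discard(const struct entry *e)
{
	if (filtered && !match_preds(e)) {
		/* ring_buffer_discard_commit() equivalent: drop the entry */
		return true;
	}
	return false;
}

static void commit(const struct entry *e)
{
	printf("committed %d\n", e->value);
}

int main(void)
{
	struct entry a = { .value = 5 }, b = { .value = -1 };

	filtered = true;
	if (!check_discard(&a))
		commit(&a);          /* passes the predicate, committed */
	if (!check_discard(&b))
		commit(&b);          /* fails the predicate, silently discarded */
	return 0;
}
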
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 80c36bcf66e8..78e27e3b52ac 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -26,7 +26,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
26{ 26{
27 /* The ftrace function trace is allowed only for root. */ 27 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) && 28 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 29 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
30 return -EPERM; 30 return -EPERM;
31 31
32 /* No tracing, just counting, so no obvious leak */ 32 /* No tracing, just counting, so no obvious leak */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 29a7ebcfb426..f919a2e21bf3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -989,7 +989,7 @@ static ssize_t
989event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 989event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
990 loff_t *ppos) 990 loff_t *ppos)
991{ 991{
992 struct ftrace_event_call *call; 992 struct ftrace_event_file *file;
993 struct trace_seq *s; 993 struct trace_seq *s;
994 int r = -ENODEV; 994 int r = -ENODEV;
995 995
@@ -1004,12 +1004,12 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1004 trace_seq_init(s); 1004 trace_seq_init(s);
1005 1005
1006 mutex_lock(&event_mutex); 1006 mutex_lock(&event_mutex);
1007 call = event_file_data(filp); 1007 file = event_file_data(filp);
1008 if (call) 1008 if (file)
1009 print_event_filter(call, s); 1009 print_event_filter(file, s);
1010 mutex_unlock(&event_mutex); 1010 mutex_unlock(&event_mutex);
1011 1011
1012 if (call) 1012 if (file)
1013 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1013 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
1014 1014
1015 kfree(s); 1015 kfree(s);
@@ -1021,7 +1021,7 @@ static ssize_t
1021event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1021event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
1022 loff_t *ppos) 1022 loff_t *ppos)
1023{ 1023{
1024 struct ftrace_event_call *call; 1024 struct ftrace_event_file *file;
1025 char *buf; 1025 char *buf;
1026 int err = -ENODEV; 1026 int err = -ENODEV;
1027 1027
@@ -1039,9 +1039,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
1039 buf[cnt] = '\0'; 1039 buf[cnt] = '\0';
1040 1040
1041 mutex_lock(&event_mutex); 1041 mutex_lock(&event_mutex);
1042 call = event_file_data(filp); 1042 file = event_file_data(filp);
1043 if (call) 1043 if (file)
1044 err = apply_event_filter(call, buf); 1044 err = apply_event_filter(file, buf);
1045 mutex_unlock(&event_mutex); 1045 mutex_unlock(&event_mutex);
1046 1046
1047 free_page((unsigned long) buf); 1047 free_page((unsigned long) buf);
@@ -1062,6 +1062,9 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1062 struct trace_array *tr; 1062 struct trace_array *tr;
1063 int ret; 1063 int ret;
1064 1064
1065 if (tracing_is_disabled())
1066 return -ENODEV;
1067
1065 /* Make sure the system still exists */ 1068 /* Make sure the system still exists */
1066 mutex_lock(&trace_types_lock); 1069 mutex_lock(&trace_types_lock);
1067 mutex_lock(&event_mutex); 1070 mutex_lock(&event_mutex);
@@ -1108,6 +1111,9 @@ static int system_tr_open(struct inode *inode, struct file *filp)
1108 struct trace_array *tr = inode->i_private; 1111 struct trace_array *tr = inode->i_private;
1109 int ret; 1112 int ret;
1110 1113
1114 if (tracing_is_disabled())
1115 return -ENODEV;
1116
1111 if (trace_array_get(tr) < 0) 1117 if (trace_array_get(tr) < 0)
1112 return -ENODEV; 1118 return -ENODEV;
1113 1119
@@ -1124,11 +1130,12 @@ static int system_tr_open(struct inode *inode, struct file *filp)
1124 if (ret < 0) { 1130 if (ret < 0) {
1125 trace_array_put(tr); 1131 trace_array_put(tr);
1126 kfree(dir); 1132 kfree(dir);
1133 return ret;
1127 } 1134 }
1128 1135
1129 filp->private_data = dir; 1136 filp->private_data = dir;
1130 1137
1131 return ret; 1138 return 0;
1132} 1139}
1133 1140
1134static int subsystem_release(struct inode *inode, struct file *file) 1141static int subsystem_release(struct inode *inode, struct file *file)
@@ -1489,12 +1496,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1489} 1496}
1490 1497
1491static int 1498static int
1492event_create_dir(struct dentry *parent, 1499event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1493 struct ftrace_event_file *file,
1494 const struct file_operations *id,
1495 const struct file_operations *enable,
1496 const struct file_operations *filter,
1497 const struct file_operations *format)
1498{ 1500{
1499 struct ftrace_event_call *call = file->event_call; 1501 struct ftrace_event_call *call = file->event_call;
1500 struct trace_array *tr = file->tr; 1502 struct trace_array *tr = file->tr;
@@ -1522,12 +1524,13 @@ event_create_dir(struct dentry *parent,
1522 1524
1523 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1525 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1524 trace_create_file("enable", 0644, file->dir, file, 1526 trace_create_file("enable", 0644, file->dir, file,
1525 enable); 1527 &ftrace_enable_fops);
1526 1528
1527#ifdef CONFIG_PERF_EVENTS 1529#ifdef CONFIG_PERF_EVENTS
1528 if (call->event.type && call->class->reg) 1530 if (call->event.type && call->class->reg)
1529 trace_create_file("id", 0444, file->dir, 1531 trace_create_file("id", 0444, file->dir,
1530 (void *)(long)call->event.type, id); 1532 (void *)(long)call->event.type,
1533 &ftrace_event_id_fops);
1531#endif 1534#endif
1532 1535
1533 /* 1536 /*
@@ -1543,11 +1546,11 @@ event_create_dir(struct dentry *parent,
1543 return -1; 1546 return -1;
1544 } 1547 }
1545 } 1548 }
1546 trace_create_file("filter", 0644, file->dir, call, 1549 trace_create_file("filter", 0644, file->dir, file,
1547 filter); 1550 &ftrace_event_filter_fops);
1548 1551
1549 trace_create_file("format", 0444, file->dir, call, 1552 trace_create_file("format", 0444, file->dir, call,
1550 format); 1553 &ftrace_event_format_fops);
1551 1554
1552 return 0; 1555 return 0;
1553} 1556}
@@ -1581,6 +1584,7 @@ static void event_remove(struct ftrace_event_call *call)
1581 if (file->event_call != call) 1584 if (file->event_call != call)
1582 continue; 1585 continue;
1583 ftrace_event_enable_disable(file, 0); 1586 ftrace_event_enable_disable(file, 0);
1587 destroy_preds(file);
1584 /* 1588 /*
1585 * The do_for_each_event_file() is 1589 * The do_for_each_event_file() is
1586 * a double loop. After finding the call for this 1590 * a double loop. After finding the call for this
@@ -1648,12 +1652,7 @@ trace_create_new_event(struct ftrace_event_call *call,
1648 1652
1649/* Add an event to a trace directory */ 1653/* Add an event to a trace directory */
1650static int 1654static int
1651__trace_add_new_event(struct ftrace_event_call *call, 1655__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
1652 struct trace_array *tr,
1653 const struct file_operations *id,
1654 const struct file_operations *enable,
1655 const struct file_operations *filter,
1656 const struct file_operations *format)
1657{ 1656{
1658 struct ftrace_event_file *file; 1657 struct ftrace_event_file *file;
1659 1658
@@ -1661,7 +1660,7 @@ __trace_add_new_event(struct ftrace_event_call *call,
1661 if (!file) 1660 if (!file)
1662 return -ENOMEM; 1661 return -ENOMEM;
1663 1662
1664 return event_create_dir(tr->event_dir, file, id, enable, filter, format); 1663 return event_create_dir(tr->event_dir, file);
1665} 1664}
1666 1665
1667/* 1666/*
@@ -1683,8 +1682,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
1683} 1682}
1684 1683
1685struct ftrace_module_file_ops; 1684struct ftrace_module_file_ops;
1686static void __add_event_to_tracers(struct ftrace_event_call *call, 1685static void __add_event_to_tracers(struct ftrace_event_call *call);
1687 struct ftrace_module_file_ops *file_ops);
1688 1686
1689/* Add an additional event_call dynamically */ 1687/* Add an additional event_call dynamically */
1690int trace_add_event_call(struct ftrace_event_call *call) 1688int trace_add_event_call(struct ftrace_event_call *call)
@@ -1695,7 +1693,7 @@ int trace_add_event_call(struct ftrace_event_call *call)
1695 1693
1696 ret = __register_event(call, NULL); 1694 ret = __register_event(call, NULL);
1697 if (ret >= 0) 1695 if (ret >= 0)
1698 __add_event_to_tracers(call, NULL); 1696 __add_event_to_tracers(call);
1699 1697
1700 mutex_unlock(&event_mutex); 1698 mutex_unlock(&event_mutex);
1701 mutex_unlock(&trace_types_lock); 1699 mutex_unlock(&trace_types_lock);
@@ -1710,7 +1708,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1710{ 1708{
1711 event_remove(call); 1709 event_remove(call);
1712 trace_destroy_fields(call); 1710 trace_destroy_fields(call);
1713 destroy_preds(call); 1711 destroy_call_preds(call);
1714} 1712}
1715 1713
1716static int probe_remove_event_call(struct ftrace_event_call *call) 1714static int probe_remove_event_call(struct ftrace_event_call *call)
@@ -1769,100 +1767,21 @@ int trace_remove_event_call(struct ftrace_event_call *call)
1769 1767
1770#ifdef CONFIG_MODULES 1768#ifdef CONFIG_MODULES
1771 1769
1772static LIST_HEAD(ftrace_module_file_list);
1773
1774/*
1775 * Modules must own their file_operations to keep up with
1776 * reference counting.
1777 */
1778struct ftrace_module_file_ops {
1779 struct list_head list;
1780 struct module *mod;
1781 struct file_operations id;
1782 struct file_operations enable;
1783 struct file_operations format;
1784 struct file_operations filter;
1785};
1786
1787static struct ftrace_module_file_ops *
1788find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1789{
1790 /*
1791 * As event_calls are added in groups by module,
1792 * when we find one file_ops, we don't need to search for
1793 * each call in that module, as the rest should be the
1794 * same. Only search for a new one if the last one did
1795 * not match.
1796 */
1797 if (file_ops && mod == file_ops->mod)
1798 return file_ops;
1799
1800 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1801 if (file_ops->mod == mod)
1802 return file_ops;
1803 }
1804 return NULL;
1805}
1806
1807static struct ftrace_module_file_ops *
1808trace_create_file_ops(struct module *mod)
1809{
1810 struct ftrace_module_file_ops *file_ops;
1811
1812 /*
1813 * This is a bit of a PITA. To allow for correct reference
1814 * counting, modules must "own" their file_operations.
1815 * To do this, we allocate the file operations that will be
1816 * used in the event directory.
1817 */
1818
1819 file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
1820 if (!file_ops)
1821 return NULL;
1822
1823 file_ops->mod = mod;
1824
1825 file_ops->id = ftrace_event_id_fops;
1826 file_ops->id.owner = mod;
1827
1828 file_ops->enable = ftrace_enable_fops;
1829 file_ops->enable.owner = mod;
1830
1831 file_ops->filter = ftrace_event_filter_fops;
1832 file_ops->filter.owner = mod;
1833
1834 file_ops->format = ftrace_event_format_fops;
1835 file_ops->format.owner = mod;
1836
1837 list_add(&file_ops->list, &ftrace_module_file_list);
1838
1839 return file_ops;
1840}
1841
1842static void trace_module_add_events(struct module *mod) 1770static void trace_module_add_events(struct module *mod)
1843{ 1771{
1844 struct ftrace_module_file_ops *file_ops = NULL;
1845 struct ftrace_event_call **call, **start, **end; 1772 struct ftrace_event_call **call, **start, **end;
1846 1773
1847 start = mod->trace_events; 1774 start = mod->trace_events;
1848 end = mod->trace_events + mod->num_trace_events; 1775 end = mod->trace_events + mod->num_trace_events;
1849 1776
1850 if (start == end)
1851 return;
1852
1853 file_ops = trace_create_file_ops(mod);
1854 if (!file_ops)
1855 return;
1856
1857 for_each_event(call, start, end) { 1777 for_each_event(call, start, end) {
1858 __register_event(*call, mod); 1778 __register_event(*call, mod);
1859 __add_event_to_tracers(*call, file_ops); 1779 __add_event_to_tracers(*call);
1860 } 1780 }
1861} 1781}
1862 1782
1863static void trace_module_remove_events(struct module *mod) 1783static void trace_module_remove_events(struct module *mod)
1864{ 1784{
1865 struct ftrace_module_file_ops *file_ops;
1866 struct ftrace_event_call *call, *p; 1785 struct ftrace_event_call *call, *p;
1867 bool clear_trace = false; 1786 bool clear_trace = false;
1868 1787
@@ -1874,16 +1793,6 @@ static void trace_module_remove_events(struct module *mod)
1874 __trace_remove_event_call(call); 1793 __trace_remove_event_call(call);
1875 } 1794 }
1876 } 1795 }
1877
1878 /* Now free the file_operations */
1879 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1880 if (file_ops->mod == mod)
1881 break;
1882 }
1883 if (&file_ops->list != &ftrace_module_file_list) {
1884 list_del(&file_ops->list);
1885 kfree(file_ops);
1886 }
1887 up_write(&trace_event_sem); 1796 up_write(&trace_event_sem);
1888 1797
1889 /* 1798 /*
@@ -1919,67 +1828,21 @@ static int trace_module_notify(struct notifier_block *self,
1919 return 0; 1828 return 0;
1920} 1829}
1921 1830
1922static int 1831static struct notifier_block trace_module_nb = {
1923__trace_add_new_mod_event(struct ftrace_event_call *call, 1832 .notifier_call = trace_module_notify,
1924 struct trace_array *tr, 1833 .priority = 0,
1925 struct ftrace_module_file_ops *file_ops) 1834};
1926{
1927 return __trace_add_new_event(call, tr,
1928 &file_ops->id, &file_ops->enable,
1929 &file_ops->filter, &file_ops->format);
1930}
1931
1932#else
1933static inline struct ftrace_module_file_ops *
1934find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1935{
1936 return NULL;
1937}
1938static inline int trace_module_notify(struct notifier_block *self,
1939 unsigned long val, void *data)
1940{
1941 return 0;
1942}
1943static inline int
1944__trace_add_new_mod_event(struct ftrace_event_call *call,
1945 struct trace_array *tr,
1946 struct ftrace_module_file_ops *file_ops)
1947{
1948 return -ENODEV;
1949}
1950#endif /* CONFIG_MODULES */ 1835#endif /* CONFIG_MODULES */
1951 1836
1952/* Create a new event directory structure for a trace directory. */ 1837/* Create a new event directory structure for a trace directory. */
1953static void 1838static void
1954__trace_add_event_dirs(struct trace_array *tr) 1839__trace_add_event_dirs(struct trace_array *tr)
1955{ 1840{
1956 struct ftrace_module_file_ops *file_ops = NULL;
1957 struct ftrace_event_call *call; 1841 struct ftrace_event_call *call;
1958 int ret; 1842 int ret;
1959 1843
1960 list_for_each_entry(call, &ftrace_events, list) { 1844 list_for_each_entry(call, &ftrace_events, list) {
1961 if (call->mod) { 1845 ret = __trace_add_new_event(call, tr);
1962 /*
1963 * Directories for events by modules need to
1964 * keep module ref counts when opened (as we don't
1965 * want the module to disappear when reading one
1966 * of these files). The file_ops keep account of
1967 * the module ref count.
1968 */
1969 file_ops = find_ftrace_file_ops(file_ops, call->mod);
1970 if (!file_ops)
1971 continue; /* Warn? */
1972 ret = __trace_add_new_mod_event(call, tr, file_ops);
1973 if (ret < 0)
1974 pr_warning("Could not create directory for event %s\n",
1975 call->name);
1976 continue;
1977 }
1978 ret = __trace_add_new_event(call, tr,
1979 &ftrace_event_id_fops,
1980 &ftrace_enable_fops,
1981 &ftrace_event_filter_fops,
1982 &ftrace_event_format_fops);
1983 if (ret < 0) 1846 if (ret < 0)
1984 pr_warning("Could not create directory for event %s\n", 1847 pr_warning("Could not create directory for event %s\n",
1985 call->name); 1848 call->name);
@@ -2287,11 +2150,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
2287 2150
2288 2151
2289 list_for_each_entry(file, &tr->events, list) { 2152 list_for_each_entry(file, &tr->events, list) {
2290 ret = event_create_dir(tr->event_dir, file, 2153 ret = event_create_dir(tr->event_dir, file);
2291 &ftrace_event_id_fops,
2292 &ftrace_enable_fops,
2293 &ftrace_event_filter_fops,
2294 &ftrace_event_format_fops);
2295 if (ret < 0) 2154 if (ret < 0)
2296 pr_warning("Could not create directory for event %s\n", 2155 pr_warning("Could not create directory for event %s\n",
2297 file->event_call->name); 2156 file->event_call->name);
@@ -2332,29 +2191,14 @@ __trace_remove_event_dirs(struct trace_array *tr)
2332 remove_event_file_dir(file); 2191 remove_event_file_dir(file);
2333} 2192}
2334 2193
2335static void 2194static void __add_event_to_tracers(struct ftrace_event_call *call)
2336__add_event_to_tracers(struct ftrace_event_call *call,
2337 struct ftrace_module_file_ops *file_ops)
2338{ 2195{
2339 struct trace_array *tr; 2196 struct trace_array *tr;
2340 2197
2341 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 2198 list_for_each_entry(tr, &ftrace_trace_arrays, list)
2342 if (file_ops) 2199 __trace_add_new_event(call, tr);
2343 __trace_add_new_mod_event(call, tr, file_ops);
2344 else
2345 __trace_add_new_event(call, tr,
2346 &ftrace_event_id_fops,
2347 &ftrace_enable_fops,
2348 &ftrace_event_filter_fops,
2349 &ftrace_event_format_fops);
2350 }
2351} 2200}
2352 2201
2353static struct notifier_block trace_module_nb = {
2354 .notifier_call = trace_module_notify,
2355 .priority = 0,
2356};
2357
2358extern struct ftrace_event_call *__start_ftrace_events[]; 2202extern struct ftrace_event_call *__start_ftrace_events[];
2359extern struct ftrace_event_call *__stop_ftrace_events[]; 2203extern struct ftrace_event_call *__stop_ftrace_events[];
2360 2204
@@ -2559,10 +2403,11 @@ static __init int event_trace_init(void)
2559 if (ret) 2403 if (ret)
2560 return ret; 2404 return ret;
2561 2405
2406#ifdef CONFIG_MODULES
2562 ret = register_module_notifier(&trace_module_nb); 2407 ret = register_module_notifier(&trace_module_nb);
2563 if (ret) 2408 if (ret)
2564 pr_warning("Failed to register trace events module notifier\n"); 2409 pr_warning("Failed to register trace events module notifier\n");
2565 2410#endif
2566 return 0; 2411 return 0;
2567} 2412}
2568early_initcall(event_trace_memsetup); 2413early_initcall(event_trace_memsetup);
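
The bulk of the trace_events.c change deletes the per-module copies of the event file_operations (trace_create_file_ops() and friends) and lets every event directory reference the shared static fops tables directly. A rough userspace analogy is sketched below; the structure and field names are invented, and the comment about why the copies existed (pinning the owning module through fops->owner) summarizes the removed code, not the full replacement mechanism.

#include <stdio.h>

/* Invented stand-in for struct file_operations. */
struct fops {
	const char *name;
	void (*open)(void);
};

static void generic_open(void) { printf("open\n"); }

/* One shared, static table -- the post-patch arrangement.  Before this
 * series, a copy of the table was kmalloc()ed per module purely so its
 * .owner field could pin the module; with that requirement handled
 * elsewhere, the copies and their bookkeeping list become dead weight. */
static const struct fops event_enable_fops = { "enable", generic_open };

struct event_file {
	const struct fops *enable;   /* every event points at the same table */
};

int main(void)
{
	struct event_file a = { &event_enable_fops };
	struct event_file b = { &event_enable_fops };

	a.enable->open();
	b.enable->open();
	printf("same fops: %d\n", a.enable == b.enable);
	return 0;
}
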
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 97daa8cf958d..2468f56dc5db 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -637,10 +637,18 @@ static void append_filter_err(struct filter_parse_state *ps,
637 free_page((unsigned long) buf); 637 free_page((unsigned long) buf);
638} 638}
639 639
640static inline struct event_filter *event_filter(struct ftrace_event_file *file)
641{
642 if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
643 return file->event_call->filter;
644 else
645 return file->filter;
646}
647
640/* caller must hold event_mutex */ 648/* caller must hold event_mutex */
641void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 649void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)
642{ 650{
643 struct event_filter *filter = call->filter; 651 struct event_filter *filter = event_filter(file);
644 652
645 if (filter && filter->filter_string) 653 if (filter && filter->filter_string)
646 trace_seq_printf(s, "%s\n", filter->filter_string); 654 trace_seq_printf(s, "%s\n", filter->filter_string);
@@ -766,11 +774,21 @@ static void __free_preds(struct event_filter *filter)
766 filter->n_preds = 0; 774 filter->n_preds = 0;
767} 775}
768 776
769static void filter_disable(struct ftrace_event_call *call) 777static void call_filter_disable(struct ftrace_event_call *call)
770{ 778{
771 call->flags &= ~TRACE_EVENT_FL_FILTERED; 779 call->flags &= ~TRACE_EVENT_FL_FILTERED;
772} 780}
773 781
782static void filter_disable(struct ftrace_event_file *file)
783{
784 struct ftrace_event_call *call = file->event_call;
785
786 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
787 call_filter_disable(call);
788 else
789 file->flags &= ~FTRACE_EVENT_FL_FILTERED;
790}
791
774static void __free_filter(struct event_filter *filter) 792static void __free_filter(struct event_filter *filter)
775{ 793{
776 if (!filter) 794 if (!filter)
@@ -781,16 +799,30 @@ static void __free_filter(struct event_filter *filter)
781 kfree(filter); 799 kfree(filter);
782} 800}
783 801
802void destroy_call_preds(struct ftrace_event_call *call)
803{
804 __free_filter(call->filter);
805 call->filter = NULL;
806}
807
808static void destroy_file_preds(struct ftrace_event_file *file)
809{
810 __free_filter(file->filter);
811 file->filter = NULL;
812}
813
784/* 814/*
785 * Called when destroying the ftrace_event_call. 815 * Called when destroying the ftrace_event_file.
786 * The call is being freed, so we do not need to worry about 816 * The file is being freed, so we do not need to worry about
787 * the call being currently used. This is for module code removing 817 * the file being currently used. This is for module code removing
788 * the tracepoints from within it. 818 * the tracepoints from within it.
789 */ 819 */
790void destroy_preds(struct ftrace_event_call *call) 820void destroy_preds(struct ftrace_event_file *file)
791{ 821{
792 __free_filter(call->filter); 822 if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
793 call->filter = NULL; 823 destroy_call_preds(file->event_call);
824 else
825 destroy_file_preds(file);
794} 826}
795 827
796static struct event_filter *__alloc_filter(void) 828static struct event_filter *__alloc_filter(void)
@@ -825,28 +857,56 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
825 return 0; 857 return 0;
826} 858}
827 859
828static void filter_free_subsystem_preds(struct event_subsystem *system) 860static inline void __remove_filter(struct ftrace_event_file *file)
829{ 861{
862 struct ftrace_event_call *call = file->event_call;
863
864 filter_disable(file);
865 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
866 remove_filter_string(call->filter);
867 else
868 remove_filter_string(file->filter);
869}
870
871static void filter_free_subsystem_preds(struct event_subsystem *system,
872 struct trace_array *tr)
873{
874 struct ftrace_event_file *file;
830 struct ftrace_event_call *call; 875 struct ftrace_event_call *call;
831 876
832 list_for_each_entry(call, &ftrace_events, list) { 877 list_for_each_entry(file, &tr->events, list) {
878 call = file->event_call;
833 if (strcmp(call->class->system, system->name) != 0) 879 if (strcmp(call->class->system, system->name) != 0)
834 continue; 880 continue;
835 881
836 filter_disable(call); 882 __remove_filter(file);
837 remove_filter_string(call->filter);
838 } 883 }
839} 884}
840 885
841static void filter_free_subsystem_filters(struct event_subsystem *system) 886static inline void __free_subsystem_filter(struct ftrace_event_file *file)
842{ 887{
888 struct ftrace_event_call *call = file->event_call;
889
890 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
891 __free_filter(call->filter);
892 call->filter = NULL;
893 } else {
894 __free_filter(file->filter);
895 file->filter = NULL;
896 }
897}
898
899static void filter_free_subsystem_filters(struct event_subsystem *system,
900 struct trace_array *tr)
901{
902 struct ftrace_event_file *file;
843 struct ftrace_event_call *call; 903 struct ftrace_event_call *call;
844 904
845 list_for_each_entry(call, &ftrace_events, list) { 905 list_for_each_entry(file, &tr->events, list) {
906 call = file->event_call;
846 if (strcmp(call->class->system, system->name) != 0) 907 if (strcmp(call->class->system, system->name) != 0)
847 continue; 908 continue;
848 __free_filter(call->filter); 909 __free_subsystem_filter(file);
849 call->filter = NULL;
850 } 910 }
851} 911}
852 912
@@ -1617,15 +1677,85 @@ fail:
1617 return err; 1677 return err;
1618} 1678}
1619 1679
1680static inline void event_set_filtered_flag(struct ftrace_event_file *file)
1681{
1682 struct ftrace_event_call *call = file->event_call;
1683
1684 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1685 call->flags |= TRACE_EVENT_FL_FILTERED;
1686 else
1687 file->flags |= FTRACE_EVENT_FL_FILTERED;
1688}
1689
1690static inline void event_set_filter(struct ftrace_event_file *file,
1691 struct event_filter *filter)
1692{
1693 struct ftrace_event_call *call = file->event_call;
1694
1695 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1696 rcu_assign_pointer(call->filter, filter);
1697 else
1698 rcu_assign_pointer(file->filter, filter);
1699}
1700
1701static inline void event_clear_filter(struct ftrace_event_file *file)
1702{
1703 struct ftrace_event_call *call = file->event_call;
1704
1705 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1706 RCU_INIT_POINTER(call->filter, NULL);
1707 else
1708 RCU_INIT_POINTER(file->filter, NULL);
1709}
1710
1711static inline void
1712event_set_no_set_filter_flag(struct ftrace_event_file *file)
1713{
1714 struct ftrace_event_call *call = file->event_call;
1715
1716 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1717 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
1718 else
1719 file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER;
1720}
1721
1722static inline void
1723event_clear_no_set_filter_flag(struct ftrace_event_file *file)
1724{
1725 struct ftrace_event_call *call = file->event_call;
1726
1727 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
1728 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
1729 else
1730 file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER;
1731}
1732
1733static inline bool
1734event_no_set_filter_flag(struct ftrace_event_file *file)
1735{
1736 struct ftrace_event_call *call = file->event_call;
1737
1738 if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
1739 return true;
1740
1741 if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
1742 (call->flags & TRACE_EVENT_FL_NO_SET_FILTER))
1743 return true;
1744
1745 return false;
1746}
1747
1620struct filter_list { 1748struct filter_list {
1621 struct list_head list; 1749 struct list_head list;
1622 struct event_filter *filter; 1750 struct event_filter *filter;
1623}; 1751};
1624 1752
1625static int replace_system_preds(struct event_subsystem *system, 1753static int replace_system_preds(struct event_subsystem *system,
1754 struct trace_array *tr,
1626 struct filter_parse_state *ps, 1755 struct filter_parse_state *ps,
1627 char *filter_string) 1756 char *filter_string)
1628{ 1757{
1758 struct ftrace_event_file *file;
1629 struct ftrace_event_call *call; 1759 struct ftrace_event_call *call;
1630 struct filter_list *filter_item; 1760 struct filter_list *filter_item;
1631 struct filter_list *tmp; 1761 struct filter_list *tmp;
@@ -1633,8 +1763,8 @@ static int replace_system_preds(struct event_subsystem *system,
1633 bool fail = true; 1763 bool fail = true;
1634 int err; 1764 int err;
1635 1765
1636 list_for_each_entry(call, &ftrace_events, list) { 1766 list_for_each_entry(file, &tr->events, list) {
1637 1767 call = file->event_call;
1638 if (strcmp(call->class->system, system->name) != 0) 1768 if (strcmp(call->class->system, system->name) != 0)
1639 continue; 1769 continue;
1640 1770
@@ -1644,18 +1774,20 @@ static int replace_system_preds(struct event_subsystem *system,
1644 */ 1774 */
1645 err = replace_preds(call, NULL, ps, filter_string, true); 1775 err = replace_preds(call, NULL, ps, filter_string, true);
1646 if (err) 1776 if (err)
1647 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; 1777 event_set_no_set_filter_flag(file);
1648 else 1778 else
1649 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; 1779 event_clear_no_set_filter_flag(file);
1650 } 1780 }
1651 1781
1652 list_for_each_entry(call, &ftrace_events, list) { 1782 list_for_each_entry(file, &tr->events, list) {
1653 struct event_filter *filter; 1783 struct event_filter *filter;
1654 1784
1785 call = file->event_call;
1786
1655 if (strcmp(call->class->system, system->name) != 0) 1787 if (strcmp(call->class->system, system->name) != 0)
1656 continue; 1788 continue;
1657 1789
1658 if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) 1790 if (event_no_set_filter_flag(file))
1659 continue; 1791 continue;
1660 1792
1661 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); 1793 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
@@ -1676,17 +1808,17 @@ static int replace_system_preds(struct event_subsystem *system,
1676 1808
1677 err = replace_preds(call, filter, ps, filter_string, false); 1809 err = replace_preds(call, filter, ps, filter_string, false);
1678 if (err) { 1810 if (err) {
1679 filter_disable(call); 1811 filter_disable(file);
1680 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1812 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1681 append_filter_err(ps, filter); 1813 append_filter_err(ps, filter);
1682 } else 1814 } else
1683 call->flags |= TRACE_EVENT_FL_FILTERED; 1815 event_set_filtered_flag(file);
1684 /* 1816 /*
1685 * Regardless of if this returned an error, we still 1817 * Regardless of if this returned an error, we still
1686 * replace the filter for the call. 1818 * replace the filter for the call.
1687 */ 1819 */
1688 filter = call->filter; 1820 filter = event_filter(file);
1689 rcu_assign_pointer(call->filter, filter_item->filter); 1821 event_set_filter(file, filter_item->filter);
1690 filter_item->filter = filter; 1822 filter_item->filter = filter;
1691 1823
1692 fail = false; 1824 fail = false;
@@ -1816,6 +1948,7 @@ static int create_filter(struct ftrace_event_call *call,
1816 * and always remembers @filter_str. 1948 * and always remembers @filter_str.
1817 */ 1949 */
1818static int create_system_filter(struct event_subsystem *system, 1950static int create_system_filter(struct event_subsystem *system,
1951 struct trace_array *tr,
1819 char *filter_str, struct event_filter **filterp) 1952 char *filter_str, struct event_filter **filterp)
1820{ 1953{
1821 struct event_filter *filter = NULL; 1954 struct event_filter *filter = NULL;
@@ -1824,7 +1957,7 @@ static int create_system_filter(struct event_subsystem *system,
1824 1957
1825 err = create_filter_start(filter_str, true, &ps, &filter); 1958 err = create_filter_start(filter_str, true, &ps, &filter);
1826 if (!err) { 1959 if (!err) {
1827 err = replace_system_preds(system, ps, filter_str); 1960 err = replace_system_preds(system, tr, ps, filter_str);
1828 if (!err) { 1961 if (!err) {
1829 /* System filters just show a default message */ 1962 /* System filters just show a default message */
1830 kfree(filter->filter_string); 1963 kfree(filter->filter_string);
@@ -1840,20 +1973,25 @@ static int create_system_filter(struct event_subsystem *system,
1840} 1973}
1841 1974
1842/* caller must hold event_mutex */ 1975/* caller must hold event_mutex */
1843int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1976int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
1844{ 1977{
1978 struct ftrace_event_call *call = file->event_call;
1845 struct event_filter *filter; 1979 struct event_filter *filter;
1846 int err; 1980 int err;
1847 1981
1848 if (!strcmp(strstrip(filter_string), "0")) { 1982 if (!strcmp(strstrip(filter_string), "0")) {
1849 filter_disable(call); 1983 filter_disable(file);
1850 filter = call->filter; 1984 filter = event_filter(file);
1985
1851 if (!filter) 1986 if (!filter)
1852 return 0; 1987 return 0;
1853 RCU_INIT_POINTER(call->filter, NULL); 1988
1989 event_clear_filter(file);
1990
1854 /* Make sure the filter is not being used */ 1991 /* Make sure the filter is not being used */
1855 synchronize_sched(); 1992 synchronize_sched();
1856 __free_filter(filter); 1993 __free_filter(filter);
1994
1857 return 0; 1995 return 0;
1858 } 1996 }
1859 1997
@@ -1866,14 +2004,15 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1866 * string 2004 * string
1867 */ 2005 */
1868 if (filter) { 2006 if (filter) {
1869 struct event_filter *tmp = call->filter; 2007 struct event_filter *tmp;
1870 2008
2009 tmp = event_filter(file);
1871 if (!err) 2010 if (!err)
1872 call->flags |= TRACE_EVENT_FL_FILTERED; 2011 event_set_filtered_flag(file);
1873 else 2012 else
1874 filter_disable(call); 2013 filter_disable(file);
1875 2014
1876 rcu_assign_pointer(call->filter, filter); 2015 event_set_filter(file, filter);
1877 2016
1878 if (tmp) { 2017 if (tmp) {
1879 /* Make sure the call is done with the filter */ 2018 /* Make sure the call is done with the filter */
@@ -1889,6 +2028,7 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1889 char *filter_string) 2028 char *filter_string)
1890{ 2029{
1891 struct event_subsystem *system = dir->subsystem; 2030 struct event_subsystem *system = dir->subsystem;
2031 struct trace_array *tr = dir->tr;
1892 struct event_filter *filter; 2032 struct event_filter *filter;
1893 int err = 0; 2033 int err = 0;
1894 2034
@@ -1901,18 +2041,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1901 } 2041 }
1902 2042
1903 if (!strcmp(strstrip(filter_string), "0")) { 2043 if (!strcmp(strstrip(filter_string), "0")) {
1904 filter_free_subsystem_preds(system); 2044 filter_free_subsystem_preds(system, tr);
1905 remove_filter_string(system->filter); 2045 remove_filter_string(system->filter);
1906 filter = system->filter; 2046 filter = system->filter;
1907 system->filter = NULL; 2047 system->filter = NULL;
1908 /* Ensure all filters are no longer used */ 2048 /* Ensure all filters are no longer used */
1909 synchronize_sched(); 2049 synchronize_sched();
1910 filter_free_subsystem_filters(system); 2050 filter_free_subsystem_filters(system, tr);
1911 __free_filter(filter); 2051 __free_filter(filter);
1912 goto out_unlock; 2052 goto out_unlock;
1913 } 2053 }
1914 2054
1915 err = create_system_filter(system, filter_string, &filter); 2055 err = create_system_filter(system, tr, filter_string, &filter);
1916 if (filter) { 2056 if (filter) {
1917 /* 2057 /*
1918 * No event actually uses the system filter 2058 * No event actually uses the system filter
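
Most of the filter rework funnels through one question: does this event still use the legacy call-wide filter (TRACE_EVENT_FL_USE_CALL_FILTER) or a per-ftrace_event_file filter owned by the trace_array? The added event_filter() helper answers it, and the set/clear/disable helpers repeat the same test. A self-contained userspace sketch of that selection, with simplified structures:

#include <stdio.h>

#define FL_USE_CALL_FILTER 0x1   /* stand-in for TRACE_EVENT_FL_USE_CALL_FILTER */

struct event_filter { const char *filter_string; };

struct event_call {
	unsigned int flags;
	struct event_filter *filter;     /* shared, call-wide filter */
};

struct event_file {
	struct event_call *event_call;
	struct event_filter *filter;     /* per-instance (per trace_array) filter */
};

/* Mirrors the event_filter() helper added above: legacy events keep using
 * the call-wide filter, everything else gets a per-file one. */
static struct event_filter *event_filter_of(struct event_file *file)
{
	if (file->event_call->flags & FL_USE_CALL_FILTER)
		return file->event_call->filter;
	return file->filter;
}

int main(void)
{
	struct event_filter callf = { "common > 0" }, filef = { "pid == 42" };
	struct event_call call = { 0, &callf };
	struct event_file file = { &call, &filef };

	printf("%s\n", event_filter_of(&file)->filter_string);  /* pid == 42  */
	call.flags |= FL_USE_CALL_FILTER;
	printf("%s\n", event_filter_of(&file)->filter_string);  /* common > 0 */
	return 0;
}
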
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d21a74670088..7c3e3e72e2b6 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -180,7 +180,7 @@ struct ftrace_event_call __used event_##call = { \
180 .event.type = etype, \ 180 .event.type = etype, \
181 .class = &event_class_ftrace_##call, \ 181 .class = &event_class_ftrace_##call, \
182 .print_fmt = print, \ 182 .print_fmt = print, \
183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ 183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
184}; \ 184}; \
185struct ftrace_event_call __used \ 185struct ftrace_event_call __used \
186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b5c09242683d..0b99120d395c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -82,9 +82,9 @@ static struct trace_array *graph_array;
82 * to fill in space into DURATION column. 82 * to fill in space into DURATION column.
83 */ 83 */
84enum { 84enum {
85 DURATION_FILL_FULL = -1, 85 FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT,
86 DURATION_FILL_START = -2, 86 FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT,
87 DURATION_FILL_END = -3, 87 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
88}; 88};
89 89
90static enum print_line_t 90static enum print_line_t
@@ -114,16 +114,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
114 return -EBUSY; 114 return -EBUSY;
115 } 115 }
116 116
117 /*
118 * The curr_ret_stack is an index to ftrace return stack of
119 * current task. Its value should be in [0, FTRACE_RETFUNC_
120 * DEPTH) when the function graph tracer is used. To support
121 * filtering out specific functions, it makes the index
122 * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH)
123 * so when it sees a negative index the ftrace will ignore
124 * the record. And the index gets recovered when returning
125 * from the filtered function by adding the FTRACE_NOTRACE_
126 * DEPTH and then it'll continue to record functions normally.
127 *
128 * The curr_ret_stack is initialized to -1 and get increased
129 * in this function. So it can be less than -1 only if it was
130 * filtered out via ftrace_graph_notrace_addr() which can be
131 * set from set_graph_notrace file in debugfs by user.
132 */
133 if (current->curr_ret_stack < -1)
134 return -EBUSY;
135
117 calltime = trace_clock_local(); 136 calltime = trace_clock_local();
118 137
119 index = ++current->curr_ret_stack; 138 index = ++current->curr_ret_stack;
139 if (ftrace_graph_notrace_addr(func))
140 current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;
120 barrier(); 141 barrier();
121 current->ret_stack[index].ret = ret; 142 current->ret_stack[index].ret = ret;
122 current->ret_stack[index].func = func; 143 current->ret_stack[index].func = func;
123 current->ret_stack[index].calltime = calltime; 144 current->ret_stack[index].calltime = calltime;
124 current->ret_stack[index].subtime = 0; 145 current->ret_stack[index].subtime = 0;
125 current->ret_stack[index].fp = frame_pointer; 146 current->ret_stack[index].fp = frame_pointer;
126 *depth = index; 147 *depth = current->curr_ret_stack;
127 148
128 return 0; 149 return 0;
129} 150}
@@ -137,7 +158,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
137 158
138 index = current->curr_ret_stack; 159 index = current->curr_ret_stack;
139 160
140 if (unlikely(index < 0)) { 161 /*
162 * A negative index here means that it's just returned from a
163 * notrace'd function. Recover index to get an original
164 * return address. See ftrace_push_return_trace().
165 *
166 * TODO: Need to check whether the stack gets corrupted.
167 */
168 if (index < 0)
169 index += FTRACE_NOTRACE_DEPTH;
170
171 if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
141 ftrace_graph_stop(); 172 ftrace_graph_stop();
142 WARN_ON(1); 173 WARN_ON(1);
143 /* Might as well panic, otherwise we have no where to go */ 174 /* Might as well panic, otherwise we have no where to go */
@@ -193,6 +224,15 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
193 trace.rettime = trace_clock_local(); 224 trace.rettime = trace_clock_local();
194 barrier(); 225 barrier();
195 current->curr_ret_stack--; 226 current->curr_ret_stack--;
227 /*
228 * The curr_ret_stack can be less than -1 only if it was
229 * filtered out and it's about to return from the function.
230 * Recover the index and continue to trace normal functions.
231 */
232 if (current->curr_ret_stack < -1) {
233 current->curr_ret_stack += FTRACE_NOTRACE_DEPTH;
234 return ret;
235 }
196 236
197 /* 237 /*
198 * The trace should run after decrementing the ret counter 238 * The trace should run after decrementing the ret counter
@@ -230,7 +270,7 @@ int __trace_graph_entry(struct trace_array *tr,
230 return 0; 270 return 0;
231 entry = ring_buffer_event_data(event); 271 entry = ring_buffer_event_data(event);
232 entry->graph_ent = *trace; 272 entry->graph_ent = *trace;
233 if (!filter_current_check_discard(buffer, call, entry, event)) 273 if (!call_filter_check_discard(call, entry, buffer, event))
234 __buffer_unlock_commit(buffer, event); 274 __buffer_unlock_commit(buffer, event);
235 275
236 return 1; 276 return 1;
@@ -259,10 +299,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
259 299
260 /* trace it when it is-nested-in or is a function enabled. */ 300 /* trace it when it is-nested-in or is a function enabled. */
261 if ((!(trace->depth || ftrace_graph_addr(trace->func)) || 301 if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
262 ftrace_graph_ignore_irqs()) || 302 ftrace_graph_ignore_irqs()) || (trace->depth < 0) ||
263 (max_depth && trace->depth >= max_depth)) 303 (max_depth && trace->depth >= max_depth))
264 return 0; 304 return 0;
265 305
306 /*
307 * Do not trace a function if it's filtered by set_graph_notrace.
308 * Make the index of ret stack negative to indicate that it should
309 * ignore further functions. But it needs its own ret stack entry
310 * to recover the original index in order to continue tracing after
311 * returning from the function.
312 */
313 if (ftrace_graph_notrace_addr(trace->func))
314 return 1;
315
266 local_irq_save(flags); 316 local_irq_save(flags);
267 cpu = raw_smp_processor_id(); 317 cpu = raw_smp_processor_id();
268 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 318 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -335,7 +385,7 @@ void __trace_graph_return(struct trace_array *tr,
335 return; 385 return;
336 entry = ring_buffer_event_data(event); 386 entry = ring_buffer_event_data(event);
337 entry->ret = *trace; 387 entry->ret = *trace;
338 if (!filter_current_check_discard(buffer, call, entry, event)) 388 if (!call_filter_check_discard(call, entry, buffer, event))
339 __buffer_unlock_commit(buffer, event); 389 __buffer_unlock_commit(buffer, event);
340} 390}
341 391
@@ -652,7 +702,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
652 } 702 }
653 703
654 /* No overhead */ 704 /* No overhead */
655 ret = print_graph_duration(DURATION_FILL_START, s, flags); 705 ret = print_graph_duration(0, s, flags | FLAGS_FILL_START);
656 if (ret != TRACE_TYPE_HANDLED) 706 if (ret != TRACE_TYPE_HANDLED)
657 return ret; 707 return ret;
658 708
@@ -664,7 +714,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
664 if (!ret) 714 if (!ret)
665 return TRACE_TYPE_PARTIAL_LINE; 715 return TRACE_TYPE_PARTIAL_LINE;
666 716
667 ret = print_graph_duration(DURATION_FILL_END, s, flags); 717 ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
668 if (ret != TRACE_TYPE_HANDLED) 718 if (ret != TRACE_TYPE_HANDLED)
669 return ret; 719 return ret;
670 720
@@ -729,14 +779,14 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
729 return TRACE_TYPE_HANDLED; 779 return TRACE_TYPE_HANDLED;
730 780
731 /* No real adata, just filling the column with spaces */ 781 /* No real adata, just filling the column with spaces */
732 switch (duration) { 782 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
733 case DURATION_FILL_FULL: 783 case FLAGS_FILL_FULL:
734 ret = trace_seq_puts(s, " | "); 784 ret = trace_seq_puts(s, " | ");
735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 785 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
736 case DURATION_FILL_START: 786 case FLAGS_FILL_START:
737 ret = trace_seq_puts(s, " "); 787 ret = trace_seq_puts(s, " ");
738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 788 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
739 case DURATION_FILL_END: 789 case FLAGS_FILL_END:
740 ret = trace_seq_puts(s, " |"); 790 ret = trace_seq_puts(s, " |");
741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 791 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
742 } 792 }
@@ -852,7 +902,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
852 } 902 }
853 903
854 /* No time */ 904 /* No time */
855 ret = print_graph_duration(DURATION_FILL_FULL, s, flags); 905 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
856 if (ret != TRACE_TYPE_HANDLED) 906 if (ret != TRACE_TYPE_HANDLED)
857 return ret; 907 return ret;
858 908
@@ -1172,7 +1222,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1172 return TRACE_TYPE_PARTIAL_LINE; 1222 return TRACE_TYPE_PARTIAL_LINE;
1173 1223
1174 /* No time */ 1224 /* No time */
1175 ret = print_graph_duration(DURATION_FILL_FULL, s, flags); 1225 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
1176 if (ret != TRACE_TYPE_HANDLED) 1226 if (ret != TRACE_TYPE_HANDLED)
1177 return ret; 1227 return ret;
1178 1228
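
The set_graph_notrace handling above encodes "inside a function we do not want to trace" by pushing the return-stack entry as usual and then driving curr_ret_stack far negative; further pushes are refused while the index is below -1, and the matching pop adds the offset back. A compact userspace model of that bookkeeping, where the depth limit and the offset are stand-ins for FTRACE_RETFUNC_DEPTH and FTRACE_NOTRACE_DEPTH:

#include <stdio.h>

#define RETFUNC_DEPTH   8
#define NOTRACE_DEPTH   65536   /* stand-in; only needs to exceed RETFUNC_DEPTH */

static int curr_ret_stack = -1;  /* -1 means "empty", as in the kernel */

/* Push one call frame; a notrace'd function leaves its entry in place but
 * drives the index negative so nested pushes are ignored. */
static int push_frame(int notrace)
{
	if (curr_ret_stack < -1)
		return -1;           /* inside a notrace'd region: ignore */
	if (curr_ret_stack + 1 >= RETFUNC_DEPTH)
		return -1;           /* toy overflow check */
	++curr_ret_stack;
	if (notrace)
		curr_ret_stack -= NOTRACE_DEPTH;
	return 0;
}

/* Pop one frame; returning from the notrace'd function restores the index. */
static void pop_frame(void)
{
	curr_ret_stack--;
	if (curr_ret_stack < -1)
		curr_ret_stack += NOTRACE_DEPTH;
}

int main(void)
{
	push_frame(0);                                          /* traced caller    */
	printf("after traced push:   %d\n", curr_ret_stack);    /* 0                */
	push_frame(1);                                          /* notrace'd callee */
	printf("after notrace push:  %d\n", curr_ret_stack);    /* 1 - 65536        */
	printf("nested push ignored: %d\n", push_frame(0));     /* -1               */
	pop_frame();                                            /* leave notrace'd  */
	printf("after notrace pop:   %d\n", curr_ret_stack);    /* 0                */
	pop_frame();
	printf("after final pop:     %d\n", curr_ret_stack);    /* -1               */
	return 0;
}
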
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 243f6834d026..dae9541ada9e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -835,7 +835,7 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
835 entry->ip = (unsigned long)tp->rp.kp.addr; 835 entry->ip = (unsigned long)tp->rp.kp.addr;
836 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 836 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
837 837
838 if (!filter_current_check_discard(buffer, call, entry, event)) 838 if (!filter_check_discard(ftrace_file, entry, buffer, event))
839 trace_buffer_unlock_commit_regs(buffer, event, 839 trace_buffer_unlock_commit_regs(buffer, event,
840 irq_flags, pc, regs); 840 irq_flags, pc, regs);
841} 841}
@@ -884,7 +884,7 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
884 entry->ret_ip = (unsigned long)ri->ret_addr; 884 entry->ret_ip = (unsigned long)ri->ret_addr;
885 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 885 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
886 886
887 if (!filter_current_check_discard(buffer, call, entry, event)) 887 if (!filter_check_discard(ftrace_file, entry, buffer, event))
888 trace_buffer_unlock_commit_regs(buffer, event, 888 trace_buffer_unlock_commit_regs(buffer, event,
889 irq_flags, pc, regs); 889 irq_flags, pc, regs);
890} 890}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index b3dcfb2f0fef..0abd9b863474 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -323,7 +323,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
323 entry = ring_buffer_event_data(event); 323 entry = ring_buffer_event_data(event);
324 entry->rw = *rw; 324 entry->rw = *rw;
325 325
326 if (!filter_check_discard(call, entry, buffer, event)) 326 if (!call_filter_check_discard(call, entry, buffer, event))
327 trace_buffer_unlock_commit(buffer, event, 0, pc); 327 trace_buffer_unlock_commit(buffer, event, 0, pc);
328} 328}
329 329
@@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
353 entry = ring_buffer_event_data(event); 353 entry = ring_buffer_event_data(event);
354 entry->map = *map; 354 entry->map = *map;
355 355
356 if (!filter_check_discard(call, entry, buffer, event)) 356 if (!call_filter_check_discard(call, entry, buffer, event))
357 trace_buffer_unlock_commit(buffer, event, 0, pc); 357 trace_buffer_unlock_commit(buffer, event, 0, pc);
358} 358}
359 359
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 34e7cbac0c9c..ed32284fbe32 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : 619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
620 '.'; 620 '.';
621 need_resched = 621
622 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; 622 switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
623 TRACE_FLAG_PREEMPT_RESCHED)) {
624 case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
625 need_resched = 'N';
626 break;
627 case TRACE_FLAG_NEED_RESCHED:
628 need_resched = 'n';
629 break;
630 case TRACE_FLAG_PREEMPT_RESCHED:
631 need_resched = 'p';
632 break;
633 default:
634 need_resched = '.';
635 break;
636 }
637
623 hardsoft_irq = 638 hardsoft_irq =
624 (hardirq && softirq) ? 'H' : 639 (hardirq && softirq) ? 'H' :
625 hardirq ? 'h' : 640 hardirq ? 'h' :
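
With the new TRACE_FLAG_PREEMPT_RESCHED bit, the latency format can now distinguish need-resched ('n'), preempt-resched ('p') and both ('N'). The switch added to trace_print_lat_fmt() reduces to a small pure function, reproduced here as a standalone sketch:

#include <stdio.h>

#define TRACE_FLAG_NEED_RESCHED    0x04
#define TRACE_FLAG_PREEMPT_RESCHED 0x20   /* the flag added to trace.h above */

/* Mirrors the switch added to trace_print_lat_fmt(). */
static char need_resched_char(unsigned int flags)
{
	switch (flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED)) {
	case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
		return 'N';
	case TRACE_FLAG_NEED_RESCHED:
		return 'n';
	case TRACE_FLAG_PREEMPT_RESCHED:
		return 'p';
	default:
		return '.';
	}
}

int main(void)
{
	printf("%c %c %c %c\n",
	       need_resched_char(0),
	       need_resched_char(TRACE_FLAG_NEED_RESCHED),
	       need_resched_char(TRACE_FLAG_PREEMPT_RESCHED),
	       need_resched_char(TRACE_FLAG_NEED_RESCHED |
				 TRACE_FLAG_PREEMPT_RESCHED));
	return 0;
}
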
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 4e98e3b257a3..3f34dc9b40f3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
45 entry->next_state = next->state; 45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next); 46 entry->next_cpu = task_cpu(next);
47 47
48 if (!filter_check_discard(call, entry, buffer, event)) 48 if (!call_filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc); 49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50} 50}
51 51
@@ -101,7 +101,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
101 entry->next_state = wakee->state; 101 entry->next_state = wakee->state;
102 entry->next_cpu = task_cpu(wakee); 102 entry->next_cpu = task_cpu(wakee);
103 103
104 if (!filter_check_discard(call, entry, buffer, event)) 104 if (!call_filter_check_discard(call, entry, buffer, event))
105 trace_buffer_unlock_commit(buffer, event, flags, pc); 105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106} 106}
107 107
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 847f88a6194b..7af67360b330 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
43/* The root directory for all stat files */ 43/* The root directory for all stat files */
44static struct dentry *stat_dir; 44static struct dentry *stat_dir;
45 45
46/* 46static void __reset_stat_session(struct stat_session *session)
47 * Iterate through the rbtree using a post order traversal path
48 * to release the next node.
49 * It won't necessary release one at each iteration
50 * but it will at least advance closer to the next one
51 * to be released.
52 */
53static struct rb_node *release_next(struct tracer_stat *ts,
54 struct rb_node *node)
55{ 47{
56 struct stat_node *snode; 48 struct stat_node *snode, *n;
57 struct rb_node *parent = rb_parent(node);
58
59 if (node->rb_left)
60 return node->rb_left;
61 else if (node->rb_right)
62 return node->rb_right;
63 else {
64 if (!parent)
65 ;
66 else if (parent->rb_left == node)
67 parent->rb_left = NULL;
68 else
69 parent->rb_right = NULL;
70 49
71 snode = container_of(node, struct stat_node, node); 50 rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) {
72 if (ts->stat_release) 51 if (session->ts->stat_release)
73 ts->stat_release(snode->stat); 52 session->ts->stat_release(snode->stat);
74 kfree(snode); 53 kfree(snode);
75
76 return parent;
77 } 54 }
78}
79
80static void __reset_stat_session(struct stat_session *session)
81{
82 struct rb_node *node = session->stat_root.rb_node;
83
84 while (node)
85 node = release_next(session->ts, node);
86 55
87 session->stat_root = RB_ROOT; 56 session->stat_root = RB_ROOT;
88} 57}
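
trace_stat.c drops its hand-rolled post-order walker in favour of rbtree_postorder_for_each_entry_safe(), which visits children before their parent and fetches the next node before the current one is freed. The sketch below shows the same children-before-parent release order on a plain binary tree (recursive for brevity, unlike the kernel's iterative helper); the node layout and callback are invented.

#include <stdio.h>
#include <stdlib.h>

/* Toy binary tree node, standing in for struct stat_node + struct rb_node. */
struct node {
	int stat;
	struct node *left, *right;
};

static struct node *new_node(int v)
{
	struct node *n = calloc(1, sizeof(*n));

	if (n)
		n->stat = v;
	return n;
}

/* Post-order release: both children are freed before their parent, which is
 * the property the removed release_next() walker maintained by hand and that
 * rbtree_postorder_for_each_entry_safe() now provides. */
static void release_postorder(struct node *n, void (*stat_release)(int))
{
	if (!n)
		return;
	release_postorder(n->left, stat_release);
	release_postorder(n->right, stat_release);
	if (stat_release)
		stat_release(n->stat);
	free(n);
}

static void show_release(int stat)
{
	printf("releasing stat %d\n", stat);
}

int main(void)
{
	struct node *root = new_node(2);

	root->left = new_node(1);
	root->right = new_node(3);
	release_postorder(root, show_release);
	return 0;
}
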
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8fd03657bc7d..e4b6d11bdf78 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -200,8 +200,8 @@ extern char *__bad_type_size(void);
         #type, #name, offsetof(typeof(trace), name),           \
         sizeof(trace.name), is_signed_type(type)
 
-static
-int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
+static int __init
+__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 {
         int i;
         int pos = 0;
@@ -228,7 +228,7 @@ int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
         return pos;
 }
 
-static int set_syscall_print_fmt(struct ftrace_event_call *call)
+static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
 {
         char *print_fmt;
         int len;
@@ -253,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call)
         return 0;
 }
 
-static void free_syscall_print_fmt(struct ftrace_event_call *call)
+static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
 {
         struct syscall_metadata *entry = call->data;
 
@@ -302,6 +302,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
 static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 {
         struct trace_array *tr = data;
+        struct ftrace_event_file *ftrace_file;
         struct syscall_trace_enter *entry;
         struct syscall_metadata *sys_data;
         struct ring_buffer_event *event;
@@ -314,7 +315,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
         syscall_nr = trace_get_syscall_nr(current, regs);
         if (syscall_nr < 0)
                 return;
-        if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))
+
+        /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
+        ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+        if (!ftrace_file)
+                return;
+
+        if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
                 return;
 
         sys_data = syscall_nr_to_meta(syscall_nr);
@@ -336,8 +343,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
         entry->nr = syscall_nr;
         syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
 
-        if (!filter_current_check_discard(buffer, sys_data->enter_event,
-                                          entry, event))
+        if (!filter_check_discard(ftrace_file, entry, buffer, event))
                 trace_current_buffer_unlock_commit(buffer, event,
                                                    irq_flags, pc);
 }
@@ -345,6 +351,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 {
         struct trace_array *tr = data;
+        struct ftrace_event_file *ftrace_file;
         struct syscall_trace_exit *entry;
         struct syscall_metadata *sys_data;
         struct ring_buffer_event *event;
@@ -356,7 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
         syscall_nr = trace_get_syscall_nr(current, regs);
         if (syscall_nr < 0)
                 return;
-        if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))
+
+        /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
+        ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+        if (!ftrace_file)
+                return;
+
+        if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
                 return;
 
         sys_data = syscall_nr_to_meta(syscall_nr);
@@ -377,8 +390,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
         entry->nr = syscall_nr;
         entry->ret = syscall_get_return_value(current, regs);
 
-        if (!filter_current_check_discard(buffer, sys_data->exit_event,
-                                          entry, event))
+        if (!filter_check_discard(ftrace_file, entry, buffer, event))
                 trace_current_buffer_unlock_commit(buffer, event,
                                                    irq_flags, pc);
 }
@@ -397,7 +409,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file,
         if (!tr->sys_refcount_enter)
                 ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
         if (!ret) {
-                set_bit(num, tr->enabled_enter_syscalls);
+                rcu_assign_pointer(tr->enter_syscall_files[num], file);
                 tr->sys_refcount_enter++;
         }
         mutex_unlock(&syscall_trace_lock);
@@ -415,10 +427,15 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
                 return;
         mutex_lock(&syscall_trace_lock);
         tr->sys_refcount_enter--;
-        clear_bit(num, tr->enabled_enter_syscalls);
+        rcu_assign_pointer(tr->enter_syscall_files[num], NULL);
         if (!tr->sys_refcount_enter)
                 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
         mutex_unlock(&syscall_trace_lock);
+        /*
+         * Callers expect the event to be completely disabled on
+         * return, so wait for current handlers to finish.
+         */
+        synchronize_sched();
 }
 
 static int reg_event_syscall_exit(struct ftrace_event_file *file,
@@ -435,7 +452,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file,
         if (!tr->sys_refcount_exit)
                 ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
         if (!ret) {
-                set_bit(num, tr->enabled_exit_syscalls);
+                rcu_assign_pointer(tr->exit_syscall_files[num], file);
                 tr->sys_refcount_exit++;
         }
         mutex_unlock(&syscall_trace_lock);
@@ -453,13 +470,18 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
                 return;
         mutex_lock(&syscall_trace_lock);
         tr->sys_refcount_exit--;
-        clear_bit(num, tr->enabled_exit_syscalls);
+        rcu_assign_pointer(tr->exit_syscall_files[num], NULL);
         if (!tr->sys_refcount_exit)
                 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
         mutex_unlock(&syscall_trace_lock);
+        /*
+         * Callers expect the event to be completely disabled on
+         * return, so wait for current handlers to finish.
+         */
+        synchronize_sched();
 }
 
-static int init_syscall_trace(struct ftrace_event_call *call)
+static int __init init_syscall_trace(struct ftrace_event_call *call)
 {
         int id;
         int num;
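
The trace_syscalls.c changes above move from a per-tracer enable bitmap to an RCU-protected array of ftrace_event_file pointers: the register path publishes the pointer with rcu_assign_pointer() under a mutex, the tracepoint handler (already inside rcu_read_lock_sched() via __DO_TRACE) samples it with rcu_dereference_sched(), and the unregister path clears the slot and then waits in synchronize_sched() so no handler can still hold the file when it returns. A stripped-down sketch of that pattern, with hypothetical names (struct demo_file, demo_files, demo_lock):

        #include <linux/mutex.h>
        #include <linux/rcupdate.h>

        #define DEMO_MAX        64

        struct demo_file { void *priv; };              /* stand-in for ftrace_event_file */

        static struct demo_file __rcu *demo_files[DEMO_MAX];
        static DEFINE_MUTEX(demo_lock);

        /* Reader: runs with sched-RCU held, e.g. from a tracepoint handler. */
        static void demo_handler(int idx)
        {
                struct demo_file *file;

                file = rcu_dereference_sched(demo_files[idx]);
                if (!file)
                        return;         /* event not enabled for this index */
                /* ... record the event through 'file' ... */
        }

        /* Writers: enable/disable under a mutex, never on the hot path. */
        static void demo_enable(int idx, struct demo_file *file)
        {
                mutex_lock(&demo_lock);
                rcu_assign_pointer(demo_files[idx], file);
                mutex_unlock(&demo_lock);
        }

        static void demo_disable(int idx)
        {
                mutex_lock(&demo_lock);
                rcu_assign_pointer(demo_files[idx], NULL);
                mutex_unlock(&demo_lock);
                /* Wait out handlers that may still see the old pointer. */
                synchronize_sched();
        }
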
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 272261b5f94f..b6dcc42ef7f5 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -128,6 +128,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
         if (is_ret)
                 tu->consumer.ret_handler = uretprobe_dispatcher;
         init_trace_uprobe_filter(&tu->filter);
+        tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
         return tu;
 
 error:
@@ -561,7 +562,7 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
         for (i = 0; i < tu->nr_args; i++)
                 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
-        if (!filter_current_check_discard(buffer, call, entry, event))
+        if (!call_filter_check_discard(call, entry, buffer, event))
                 trace_buffer_unlock_commit(buffer, event, 0, 0);
 }
 
diff --git a/kernel/uid16.c b/kernel/uid16.c
index f6c83d7ef000..602e5bbbceff 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
         struct group_info *group_info;
         int retval;
 
-        if (!nsown_capable(CAP_SETGID))
+        if (!ns_capable(current_user_ns(), CAP_SETGID))
                 return -EPERM;
         if ((unsigned)gidsetsize > NGROUPS_MAX)
                 return -EINVAL;
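
This uid16.c change (and the utsname.c one further down) replaces the removed nsown_capable() wrapper with an explicit check against the caller's own user namespace; the two spellings are equivalent because nsown_capable(cap) was simply ns_capable(current_user_ns(), cap). A short sketch of the open-coded form, assuming a hypothetical demo_set_groups() call site:

        #include <linux/capability.h>
        #include <linux/cred.h>
        #include <linux/errno.h>

        /*
         * "May the current task use CAP_SETGID in its *own* user namespace?"
         * This is what nsown_capable(CAP_SETGID) used to spell; with that
         * helper gone, the namespace is named explicitly at each call site.
         */
        static int demo_set_groups(void)
        {
                if (!ns_capable(current_user_ns(), CAP_SETGID))
                        return -EPERM;

                /* ... privileged group-list update would go here ... */
                return 0;
        }
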
diff --git a/kernel/up.c b/kernel/up.c
index c54c75e9faf7..509403e3fbc6 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -10,12 +10,75 @@
 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
                              int wait)
 {
+        unsigned long flags;
+
         WARN_ON(cpu != 0);
 
-        local_irq_disable();
-        (func)(info);
-        local_irq_enable();
+        local_irq_save(flags);
+        func(info);
+        local_irq_restore(flags);
 
         return 0;
 }
 EXPORT_SYMBOL(smp_call_function_single);
+
+void __smp_call_function_single(int cpu, struct call_single_data *csd,
+                                int wait)
+{
+        unsigned long flags;
+
+        local_irq_save(flags);
+        csd->func(csd->info);
+        local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__smp_call_function_single);
+
+int on_each_cpu(smp_call_func_t func, void *info, int wait)
+{
+        unsigned long flags;
+
+        local_irq_save(flags);
+        func(info);
+        local_irq_restore(flags);
+        return 0;
+}
+EXPORT_SYMBOL(on_each_cpu);
+
+/*
+ * Note we still need to test the mask even for UP
+ * because we actually can get an empty mask from
+ * code that on SMP might call us without the local
+ * CPU in the mask.
+ */
+void on_each_cpu_mask(const struct cpumask *mask,
+                      smp_call_func_t func, void *info, bool wait)
+{
+        unsigned long flags;
+
+        if (cpumask_test_cpu(0, mask)) {
+                local_irq_save(flags);
+                func(info);
+                local_irq_restore(flags);
+        }
+}
+EXPORT_SYMBOL(on_each_cpu_mask);
+
+/*
+ * Preemption is disabled here to make sure the cond_func is called under the
+ * same conditions in UP and SMP.
+ */
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+                      smp_call_func_t func, void *info, bool wait,
+                      gfp_t gfp_flags)
+{
+        unsigned long flags;
+
+        preempt_disable();
+        if (cond_func(0, info)) {
+                local_irq_save(flags);
+                func(info);
+                local_irq_restore(flags);
+        }
+        preempt_enable();
+}
+EXPORT_SYMBOL(on_each_cpu_cond);
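
The new UP stubs above mirror the SMP prototypes so callers can be written once; on a uniprocessor build the "cross call" degenerates into running the function locally with interrupts off. A small usage sketch for on_each_cpu_cond(), with hypothetical demo_* names (the wait and gfp arguments are accepted but have no effect on UP):

        #include <linux/smp.h>
        #include <linux/gfp.h>

        static bool demo_needs_flush(int cpu, void *info)
        {
                /* Per-CPU predicate; on UP this is only ever asked about CPU 0. */
                return true;
        }

        static void demo_flush(void *info)
        {
                /* Runs with interrupts disabled on each selected CPU. */
        }

        static void demo_flush_everywhere(void)
        {
                /*
                 * The same call works on SMP and UP kernels; the UP version
                 * evaluates the predicate for CPU 0 and, if it holds, runs
                 * demo_flush() locally under local_irq_save().
                 */
                on_each_cpu_cond(demo_needs_flush, demo_flush, NULL, true, GFP_KERNEL);
        }
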
diff --git a/kernel/user.c b/kernel/user.c
index 69b4c3d48cde..a3a0dbfda329 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,8 +51,10 @@ struct user_namespace init_user_ns = {
         .owner = GLOBAL_ROOT_UID,
         .group = GLOBAL_ROOT_GID,
         .proc_inum = PROC_USER_INIT_INO,
-        .may_mount_sysfs = true,
-        .may_mount_proc = true,
+#ifdef CONFIG_KEYS_KERBEROS_CACHE
+        .krb_cache_register_sem =
+                __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem),
+#endif
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9064b919a406..240fb62cf394 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -101,8 +101,9 @@ int create_user_ns(struct cred *new)
 
         set_cred_user_ns(new, ns);
 
-        update_mnt_policy(ns);
-
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+        init_rwsem(&ns->persistent_keyring_register_sem);
+#endif
         return 0;
 }
 
@@ -132,6 +133,9 @@ void free_user_ns(struct user_namespace *ns)
 
         do {
                 parent = ns->parent;
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+                key_put(ns->persistent_keyring_register);
+#endif
                 proc_free_inum(ns->proc_inum);
                 kmem_cache_free(user_ns_cachep, ns);
                 ns = parent;
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 2fc8576efaa8..fd393124e507 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -114,7 +114,7 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
         struct uts_namespace *ns = new;
 
         if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
-            !nsown_capable(CAP_SYS_ADMIN))
+            !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
                 return -EPERM;
 
         get_uts_ns(ns);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 51c4f34d258e..4431610f049a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = {
         .unpark         = watchdog_enable,
 };
 
-static int watchdog_enable_all_cpus(void)
+static void restart_watchdog_hrtimer(void *info)
+{
+        struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+        int ret;
+
+        /*
+         * No need to cancel and restart hrtimer if it is currently executing
+         * because it will reprogram itself with the new period now.
+         * We should never see it unqueued here because we are running per-cpu
+         * with interrupts disabled.
+         */
+        ret = hrtimer_try_to_cancel(hrtimer);
+        if (ret == 1)
+                hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+                              HRTIMER_MODE_REL_PINNED);
+}
+
+static void update_timers(int cpu)
+{
+        struct call_single_data data = {.func = restart_watchdog_hrtimer};
+        /*
+         * Make sure that the perf event counter will adapt to a new
+         * sampling period. Updating the sampling period directly would
+         * be much nicer but we do not have an API for that now so
+         * let's use a big hammer.
+         * Hrtimer will adopt the new period on the next tick but this
+         * might be late already so we have to restart the timer as well.
+         */
+        watchdog_nmi_disable(cpu);
+        __smp_call_function_single(cpu, &data, 1);
+        watchdog_nmi_enable(cpu);
+}
+
+static void update_timers_all_cpus(void)
+{
+        int cpu;
+
+        get_online_cpus();
+        preempt_disable();
+        for_each_online_cpu(cpu)
+                update_timers(cpu);
+        preempt_enable();
+        put_online_cpus();
+}
+
+static int watchdog_enable_all_cpus(bool sample_period_changed)
 {
         int err = 0;
 
@@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void)
                         pr_err("Failed to create watchdog threads, disabled\n");
                 else
                         watchdog_running = 1;
+        } else if (sample_period_changed) {
+                update_timers_all_cpus();
         }
 
         return err;
@@ -520,13 +567,15 @@ int proc_dowatchdog(struct ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
         int err, old_thresh, old_enabled;
+        static DEFINE_MUTEX(watchdog_proc_mutex);
 
+        mutex_lock(&watchdog_proc_mutex);
         old_thresh = ACCESS_ONCE(watchdog_thresh);
         old_enabled = ACCESS_ONCE(watchdog_user_enabled);
 
         err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
         if (err || !write)
-                return err;
+                goto out;
 
         set_sample_period();
         /*
@@ -535,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
          * watchdog_*_all_cpus() function takes care of this.
          */
         if (watchdog_user_enabled && watchdog_thresh)
-                err = watchdog_enable_all_cpus();
+                err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
         else
                 watchdog_disable_all_cpus();
 
@@ -544,7 +593,8 @@ int proc_dowatchdog(struct ctl_table *table, int write,
                 watchdog_thresh = old_thresh;
                 watchdog_user_enabled = old_enabled;
         }
-
+out:
+        mutex_unlock(&watchdog_proc_mutex);
         return err;
 }
 #endif /* CONFIG_SYSCTL */
@@ -554,5 +604,5 @@ void __init lockup_detector_init(void)
         set_sample_period();
 
         if (watchdog_user_enabled)
-                watchdog_enable_all_cpus();
+                watchdog_enable_all_cpus(false);
 }
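
The watchdog changes above apply a new sample_period by sending each online CPU a cross call that cancels and re-arms its pinned hrtimer (with a perf-NMI disable/enable cycle as the "big hammer"), and they serialize concurrent sysctl writers with a function-local mutex. A minimal sketch of just the re-arm step, using the plain smp_call_function_single() instead of the csd-based __smp_call_function_single() used above, and hypothetical demo_timer / demo_period_ns names:

        #include <linux/hrtimer.h>
        #include <linux/ktime.h>
        #include <linux/percpu.h>
        #include <linux/smp.h>

        static DEFINE_PER_CPU(struct hrtimer, demo_timer);
        static u64 demo_period_ns;

        /* Runs on the target CPU, in IPI context with interrupts disabled. */
        static void demo_restart_timer(void *info)
        {
                struct hrtimer *t = this_cpu_ptr(&demo_timer);

                /*
                 * hrtimer_try_to_cancel() returns 1 only when the timer was
                 * queued and is now removed; re-arm it with the new period.
                 * If it is currently firing it will reprogram itself.
                 */
                if (hrtimer_try_to_cancel(t) == 1)
                        hrtimer_start(t, ns_to_ktime(demo_period_ns),
                                      HRTIMER_MODE_REL_PINNED);
        }

        static void demo_update_period_on(int cpu, u64 new_ns)
        {
                demo_period_ns = new_ns;
                /* Re-arm on the CPU that owns the pinned timer, and wait. */
                smp_call_function_single(cpu, demo_restart_timer, NULL, 1);
        }
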