author     Linus Torvalds <torvalds@linux-foundation.org>    2015-06-22 18:52:04 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2015-06-22 18:52:04 -0400
commit     23b7776290b10297fe2cae0fb5f166a4f2c68121 (patch)
tree       73d1e76644a20bc7bff80fbfdb08e8b9a9f28420
parent     6bc4c3ad3619e1bcb4a6330e030007ace8ca465e (diff)
parent     6fab54101923044712baee429ff573f03b99fc47 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes are:

   - lockless wakeup support for futexes and IPC message queues
     (Davidlohr Bueso, Peter Zijlstra)

   - Replace spinlocks with atomics in thread_group_cputimer(), to
     improve scalability (Jason Low)

   - NUMA balancing improvements (Rik van Riel)

   - SCHED_DEADLINE improvements (Wanpeng Li)

   - clean up and reorganize preemption helpers (Frederic Weisbecker)

   - decouple page fault disabling machinery from the preemption
     counter, to improve debuggability and robustness (David Hildenbrand)

   - SCHED_DEADLINE documentation updates (Luca Abeni)

   - topology CPU masks cleanups (Bartosz Golaszewski)

   - /proc/sched_debug improvements (Srikar Dronamraju)"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (79 commits)
  sched/deadline: Remove needless parameter in dl_runtime_exceeded()
  sched: Remove superfluous resetting of the p->dl_throttled flag
  sched/deadline: Drop duplicate init_sched_dl_class() declaration
  sched/deadline: Reduce rq lock contention by eliminating locking of non-feasible target
  sched/deadline: Make init_sched_dl_class() __init
  sched/deadline: Optimize pull_dl_task()
  sched/preempt: Add static_key() to preempt_notifiers
  sched/preempt: Fix preempt notifiers documentation about hlist_del() within unsafe iteration
  sched/stop_machine: Fix deadlock between multiple stop_two_cpus()
  sched/debug: Add sum_sleep_runtime to /proc/<pid>/sched
  sched/debug: Replace vruntime with wait_sum in /proc/sched_debug
  sched/debug: Properly format runnable tasks in /proc/sched_debug
  sched/numa: Only consider less busy nodes as numa balancing destinations
  Revert 095bebf61a46 ("sched/numa: Do not move past the balance point if unbalanced")
  sched/fair: Prevent throttling in early pick_next_task_fair()
  preempt: Reorganize the notrace definitions a bit
  preempt: Use preempt_schedule_context() as the official tracing preemption point
  sched: Make preempt_schedule_context() function-tracing safe
  x86: Remove cpu_sibling_mask() and cpu_core_mask()
  x86: Replace cpu_**_mask() with topology_**_cpumask()
  ...
-rw-r--r--  Documentation/cputopology.txt  37
-rw-r--r--  Documentation/scheduler/sched-deadline.txt  184
-rw-r--r--  arch/alpha/mm/fault.c  5
-rw-r--r--  arch/arc/include/asm/futex.h  10
-rw-r--r--  arch/arc/mm/fault.c  2
-rw-r--r--  arch/arm/include/asm/futex.h  13
-rw-r--r--  arch/arm/include/asm/topology.h  2
-rw-r--r--  arch/arm/mm/fault.c  2
-rw-r--r--  arch/arm/mm/highmem.c  3
-rw-r--r--  arch/arm64/include/asm/futex.h  4
-rw-r--r--  arch/arm64/include/asm/topology.h  2
-rw-r--r--  arch/arm64/mm/fault.c  2
-rw-r--r--  arch/avr32/include/asm/uaccess.h  12
-rw-r--r--  arch/avr32/mm/fault.c  4
-rw-r--r--  arch/cris/mm/fault.c  6
-rw-r--r--  arch/frv/mm/fault.c  4
-rw-r--r--  arch/frv/mm/highmem.c  2
-rw-r--r--  arch/hexagon/include/asm/uaccess.h  3
-rw-r--r--  arch/ia64/include/asm/topology.h  2
-rw-r--r--  arch/ia64/mm/fault.c  4
-rw-r--r--  arch/m32r/include/asm/uaccess.h  30
-rw-r--r--  arch/m32r/mm/fault.c  8
-rw-r--r--  arch/m68k/include/asm/irqflags.h  3
-rw-r--r--  arch/m68k/mm/fault.c  4
-rw-r--r--  arch/metag/mm/fault.c  2
-rw-r--r--  arch/metag/mm/highmem.c  4
-rw-r--r--  arch/microblaze/include/asm/uaccess.h  6
-rw-r--r--  arch/microblaze/mm/fault.c  8
-rw-r--r--  arch/microblaze/mm/highmem.c  4
-rw-r--r--  arch/mips/include/asm/topology.h  2
-rw-r--r--  arch/mips/include/asm/uaccess.h  45
-rw-r--r--  arch/mips/kernel/signal-common.h  9
-rw-r--r--  arch/mips/mm/fault.c  4
-rw-r--r--  arch/mips/mm/highmem.c  5
-rw-r--r--  arch/mips/mm/init.c  2
-rw-r--r--  arch/mn10300/include/asm/highmem.h  3
-rw-r--r--  arch/mn10300/mm/fault.c  4
-rw-r--r--  arch/nios2/mm/fault.c  2
-rw-r--r--  arch/parisc/include/asm/cacheflush.h  2
-rw-r--r--  arch/parisc/kernel/traps.c  4
-rw-r--r--  arch/parisc/mm/fault.c  4
-rw-r--r--  arch/powerpc/include/asm/topology.h  2
-rw-r--r--  arch/powerpc/lib/vmx-helper.c  11
-rw-r--r--  arch/powerpc/mm/fault.c  9
-rw-r--r--  arch/powerpc/mm/highmem.c  4
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c  2
-rw-r--r--  arch/s390/include/asm/topology.h  3
-rw-r--r--  arch/s390/include/asm/uaccess.h  15
-rw-r--r--  arch/s390/mm/fault.c  2
-rw-r--r--  arch/score/include/asm/uaccess.h  15
-rw-r--r--  arch/score/mm/fault.c  3
-rw-r--r--  arch/sh/mm/fault.c  5
-rw-r--r--  arch/sparc/include/asm/topology_64.h  2
-rw-r--r--  arch/sparc/mm/fault_32.c  4
-rw-r--r--  arch/sparc/mm/fault_64.c  4
-rw-r--r--  arch/sparc/mm/highmem.c  4
-rw-r--r--  arch/sparc/mm/init_64.c  2
-rw-r--r--  arch/tile/include/asm/topology.h  2
-rw-r--r--  arch/tile/include/asm/uaccess.h  18
-rw-r--r--  arch/tile/mm/fault.c  4
-rw-r--r--  arch/tile/mm/highmem.c  3
-rw-r--r--  arch/um/kernel/trap.c  5
-rw-r--r--  arch/unicore32/mm/fault.c  2
-rw-r--r--  arch/x86/include/asm/preempt.h  8
-rw-r--r--  arch/x86/include/asm/smp.h  10
-rw-r--r--  arch/x86/include/asm/topology.h  2
-rw-r--r--  arch/x86/include/asm/uaccess.h  15
-rw-r--r--  arch/x86/include/asm/uaccess_32.h  6
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c  6
-rw-r--r--  arch/x86/kernel/cpu/proc.c  3
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c  4
-rw-r--r--  arch/x86/kernel/process.c  7
-rw-r--r--  arch/x86/kernel/smpboot.c  42
-rw-r--r--  arch/x86/kernel/tsc_sync.c  2
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c  4
-rw-r--r--  arch/x86/lib/thunk_32.S  4
-rw-r--r--  arch/x86/lib/thunk_64.S  4
-rw-r--r--  arch/x86/lib/usercopy_32.c  6
-rw-r--r--  arch/x86/mm/fault.c  5
-rw-r--r--  arch/x86/mm/highmem_32.c  3
-rw-r--r--  arch/x86/mm/iomap_32.c  2
-rw-r--r--  arch/xtensa/mm/fault.c  4
-rw-r--r--  arch/xtensa/mm/highmem.c  2
-rw-r--r--  block/blk-mq-cpumap.c  2
-rw-r--r--  drivers/acpi/acpi_pad.c  2
-rw-r--r--  drivers/base/topology.c  2
-rw-r--r--  drivers/cpufreq/acpi-cpufreq.c  5
-rw-r--r--  drivers/cpufreq/p4-clockmod.c  2
-rw-r--r--  drivers/cpufreq/powernow-k8.c  13
-rw-r--r--  drivers/cpufreq/speedstep-ich.c  2
-rw-r--r--  drivers/crypto/vmx/aes.c  8
-rw-r--r--  drivers/crypto/vmx/aes_cbc.c  6
-rw-r--r--  drivers/crypto/vmx/ghash.c  8
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_execbuffer.c  3
-rw-r--r--  drivers/hwmon/coretemp.c  3
-rw-r--r--  drivers/net/ethernet/sfc/efx.c  2
-rw-r--r--  drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c  2
-rw-r--r--  drivers/staging/lustre/lustre/ptlrpc/service.c  4
-rw-r--r--  include/asm-generic/futex.h  7
-rw-r--r--  include/asm-generic/preempt.h  7
-rw-r--r--  include/linux/bottom_half.h  1
-rw-r--r--  include/linux/hardirq.h  2
-rw-r--r--  include/linux/highmem.h  2
-rw-r--r--  include/linux/init_task.h  5
-rw-r--r--  include/linux/io-mapping.h  2
-rw-r--r--  include/linux/kernel.h  3
-rw-r--r--  include/linux/lglock.h  5
-rw-r--r--  include/linux/preempt.h  159
-rw-r--r--  include/linux/preempt_mask.h  117
-rw-r--r--  include/linux/sched.h  118
-rw-r--r--  include/linux/topology.h  6
-rw-r--r--  include/linux/uaccess.h  48
-rw-r--r--  include/linux/wait.h  17
-rw-r--r--  include/trace/events/sched.h  3
-rw-r--r--  ipc/mqueue.c  54
-rw-r--r--  kernel/fork.c  8
-rw-r--r--  kernel/futex.c  33
-rw-r--r--  kernel/locking/lglock.c  22
-rw-r--r--  kernel/sched/Makefile  2
-rw-r--r--  kernel/sched/auto_group.c  6
-rw-r--r--  kernel/sched/auto_group.h  2
-rw-r--r--  kernel/sched/core.c  136
-rw-r--r--  kernel/sched/cputime.c  2
-rw-r--r--  kernel/sched/deadline.c  51
-rw-r--r--  kernel/sched/debug.c  11
-rw-r--r--  kernel/sched/fair.c  372
-rw-r--r--  kernel/sched/loadavg.c (renamed from kernel/sched/proc.c)  236
-rw-r--r--  kernel/sched/rt.c  2
-rw-r--r--  kernel/sched/sched.h  11
-rw-r--r--  kernel/sched/stats.h  15
-rw-r--r--  kernel/sched/wait.c  4
-rw-r--r--  kernel/signal.c  6
-rw-r--r--  kernel/time/posix-cpu-timers.c  87
-rw-r--r--  lib/cpu_rmap.c  2
-rw-r--r--  lib/radix-tree.c  2
-rw-r--r--  lib/strnlen_user.c  6
-rw-r--r--  mm/memory.c  18
138 files changed, 1442 insertions, 972 deletions
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 0aad6deb2d96..12b1b25b4da9 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -1,6 +1,6 @@
1 1
2Export CPU topology info via sysfs. Items (attributes) are similar 2Export CPU topology info via sysfs. Items (attributes) are similar
3to /proc/cpuinfo. 3to /proc/cpuinfo output of some architectures:
4 4
51) /sys/devices/system/cpu/cpuX/topology/physical_package_id: 51) /sys/devices/system/cpu/cpuX/topology/physical_package_id:
6 6
@@ -23,20 +23,35 @@ to /proc/cpuinfo.
234) /sys/devices/system/cpu/cpuX/topology/thread_siblings: 234) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
24 24
25 internal kernel map of cpuX's hardware threads within the same 25 internal kernel map of cpuX's hardware threads within the same
26 core as cpuX 26 core as cpuX.
27 27
285) /sys/devices/system/cpu/cpuX/topology/core_siblings: 285) /sys/devices/system/cpu/cpuX/topology/thread_siblings_list:
29
30 human-readable list of cpuX's hardware threads within the same
31 core as cpuX.
32
336) /sys/devices/system/cpu/cpuX/topology/core_siblings:
29 34
30 internal kernel map of cpuX's hardware threads within the same 35 internal kernel map of cpuX's hardware threads within the same
31 physical_package_id. 36 physical_package_id.
32 37
336) /sys/devices/system/cpu/cpuX/topology/book_siblings: 387) /sys/devices/system/cpu/cpuX/topology/core_siblings_list:
39
40 human-readable list of cpuX's hardware threads within the same
41 physical_package_id.
42
438) /sys/devices/system/cpu/cpuX/topology/book_siblings:
34 44
35 internal kernel map of cpuX's hardware threads within the same 45 internal kernel map of cpuX's hardware threads within the same
36 book_id. 46 book_id.
37 47
489) /sys/devices/system/cpu/cpuX/topology/book_siblings_list:
49
50 human-readable list of cpuX's hardware threads within the same
51 book_id.
52
38To implement it in an architecture-neutral way, a new source file, 53To implement it in an architecture-neutral way, a new source file,
39drivers/base/topology.c, is to export the 4 or 6 attributes. The two book 54drivers/base/topology.c, is to export the 6 or 9 attributes. The three book
40related sysfs files will only be created if CONFIG_SCHED_BOOK is selected. 55related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
41 56
42For an architecture to support this feature, it must define some of 57For an architecture to support this feature, it must define some of
@@ -44,20 +59,22 @@ these macros in include/asm-XXX/topology.h:
44#define topology_physical_package_id(cpu) 59#define topology_physical_package_id(cpu)
45#define topology_core_id(cpu) 60#define topology_core_id(cpu)
46#define topology_book_id(cpu) 61#define topology_book_id(cpu)
47#define topology_thread_cpumask(cpu) 62#define topology_sibling_cpumask(cpu)
48#define topology_core_cpumask(cpu) 63#define topology_core_cpumask(cpu)
49#define topology_book_cpumask(cpu) 64#define topology_book_cpumask(cpu)
50 65
51The type of **_id is int. 66The type of **_id macros is int.
52The type of siblings is (const) struct cpumask *. 67The type of **_cpumask macros is (const) struct cpumask *. The latter
68correspond with appropriate **_siblings sysfs attributes (except for
69topology_sibling_cpumask() which corresponds with thread_siblings).
53 70
54To be consistent on all architectures, include/linux/topology.h 71To be consistent on all architectures, include/linux/topology.h
55provides default definitions for any of the above macros that are 72provides default definitions for any of the above macros that are
56not defined by include/asm-XXX/topology.h: 73not defined by include/asm-XXX/topology.h:
571) physical_package_id: -1 741) physical_package_id: -1
582) core_id: 0 752) core_id: 0
593) thread_siblings: just the given CPU 763) sibling_cpumask: just the given CPU
604) core_siblings: just the given CPU 774) core_cpumask: just the given CPU
61 78
62For architectures that don't support books (CONFIG_SCHED_BOOK) there are no 79For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
63default definitions for topology_book_id() and topology_book_cpumask(). 80default definitions for topology_book_id() and topology_book_cpumask().
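
Aside (not part of the patch): the documentation above describes the renamed topology_sibling_cpumask() helper. A minimal sketch of how kernel code might walk a CPU's hardware-thread siblings with it; the function name print_thread_siblings() is invented for illustration, the cpumask iteration helpers are the usual kernel ones.

#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/topology.h>

/* Illustrative only: dump the hardware threads sharing a core with @cpu,
 * i.e. the mask backing the thread_siblings sysfs attribute. */
static void print_thread_siblings(unsigned int cpu)
{
	unsigned int sibling;

	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
		pr_info("cpu%u: sibling thread cpu%u\n", cpu, sibling);
}
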
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index 21461a0441c1..e114513a2731 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -8,6 +8,10 @@ CONTENTS
8 1. Overview 8 1. Overview
9 2. Scheduling algorithm 9 2. Scheduling algorithm
10 3. Scheduling Real-Time Tasks 10 3. Scheduling Real-Time Tasks
11 3.1 Definitions
12 3.2 Schedulability Analysis for Uniprocessor Systems
13 3.3 Schedulability Analysis for Multiprocessor Systems
14 3.4 Relationship with SCHED_DEADLINE Parameters
11 4. Bandwidth management 15 4. Bandwidth management
12 4.1 System-wide settings 16 4.1 System-wide settings
13 4.2 Task interface 17 4.2 Task interface
@@ -43,7 +47,7 @@ CONTENTS
43 "deadline", to schedule tasks. A SCHED_DEADLINE task should receive 47 "deadline", to schedule tasks. A SCHED_DEADLINE task should receive
44 "runtime" microseconds of execution time every "period" microseconds, and 48 "runtime" microseconds of execution time every "period" microseconds, and
45 these "runtime" microseconds are available within "deadline" microseconds 49 these "runtime" microseconds are available within "deadline" microseconds
46 from the beginning of the period. In order to implement this behaviour, 50 from the beginning of the period. In order to implement this behavior,
47 every time the task wakes up, the scheduler computes a "scheduling deadline" 51 every time the task wakes up, the scheduler computes a "scheduling deadline"
48 consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then 52 consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then
49 scheduled using EDF[1] on these scheduling deadlines (the task with the 53 scheduled using EDF[1] on these scheduling deadlines (the task with the
@@ -52,7 +56,7 @@ CONTENTS
52 "admission control" strategy (see Section "4. Bandwidth management") is used 56 "admission control" strategy (see Section "4. Bandwidth management") is used
53 (clearly, if the system is overloaded this guarantee cannot be respected). 57 (clearly, if the system is overloaded this guarantee cannot be respected).
54 58
55 Summing up, the CBS[2,3] algorithms assigns scheduling deadlines to tasks so 59 Summing up, the CBS[2,3] algorithm assigns scheduling deadlines to tasks so
56 that each task runs for at most its runtime every period, avoiding any 60 that each task runs for at most its runtime every period, avoiding any
57 interference between different tasks (bandwidth isolation), while the EDF[1] 61 interference between different tasks (bandwidth isolation), while the EDF[1]
58 algorithm selects the task with the earliest scheduling deadline as the one 62 algorithm selects the task with the earliest scheduling deadline as the one
@@ -63,7 +67,7 @@ CONTENTS
63 In more details, the CBS algorithm assigns scheduling deadlines to 67 In more details, the CBS algorithm assigns scheduling deadlines to
64 tasks in the following way: 68 tasks in the following way:
65 69
66 - Each SCHED_DEADLINE task is characterised by the "runtime", 70 - Each SCHED_DEADLINE task is characterized by the "runtime",
67 "deadline", and "period" parameters; 71 "deadline", and "period" parameters;
68 72
69 - The state of the task is described by a "scheduling deadline", and 73 - The state of the task is described by a "scheduling deadline", and
@@ -78,7 +82,7 @@ CONTENTS
78 82
79 then, if the scheduling deadline is smaller than the current time, or 83 then, if the scheduling deadline is smaller than the current time, or
80 this condition is verified, the scheduling deadline and the 84 this condition is verified, the scheduling deadline and the
81 remaining runtime are re-initialised as 85 remaining runtime are re-initialized as
82 86
83 scheduling deadline = current time + deadline 87 scheduling deadline = current time + deadline
84 remaining runtime = runtime 88 remaining runtime = runtime
@@ -126,31 +130,37 @@ CONTENTS
126 suited for periodic or sporadic real-time tasks that need guarantees on their 130 suited for periodic or sporadic real-time tasks that need guarantees on their
127 timing behavior, e.g., multimedia, streaming, control applications, etc. 131 timing behavior, e.g., multimedia, streaming, control applications, etc.
128 132
1333.1 Definitions
134------------------------
135
129 A typical real-time task is composed of a repetition of computation phases 136 A typical real-time task is composed of a repetition of computation phases
130 (task instances, or jobs) which are activated on a periodic or sporadic 137 (task instances, or jobs) which are activated on a periodic or sporadic
131 fashion. 138 fashion.
132 Each job J_j (where J_j is the j^th job of the task) is characterised by an 139 Each job J_j (where J_j is the j^th job of the task) is characterized by an
133 arrival time r_j (the time when the job starts), an amount of computation 140 arrival time r_j (the time when the job starts), an amount of computation
134 time c_j needed to finish the job, and a job absolute deadline d_j, which 141 time c_j needed to finish the job, and a job absolute deadline d_j, which
135 is the time within which the job should be finished. The maximum execution 142 is the time within which the job should be finished. The maximum execution
136 time max_j{c_j} is called "Worst Case Execution Time" (WCET) for the task. 143 time max{c_j} is called "Worst Case Execution Time" (WCET) for the task.
137 A real-time task can be periodic with period P if r_{j+1} = r_j + P, or 144 A real-time task can be periodic with period P if r_{j+1} = r_j + P, or
138 sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally, 145 sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally,
139 d_j = r_j + D, where D is the task's relative deadline. 146 d_j = r_j + D, where D is the task's relative deadline.
140 The utilisation of a real-time task is defined as the ratio between its 147 Summing up, a real-time task can be described as
148 Task = (WCET, D, P)
149
150 The utilization of a real-time task is defined as the ratio between its
141 WCET and its period (or minimum inter-arrival time), and represents 151 WCET and its period (or minimum inter-arrival time), and represents
142 the fraction of CPU time needed to execute the task. 152 the fraction of CPU time needed to execute the task.
143 153
144 If the total utilisation sum_i(WCET_i/P_i) is larger than M (with M equal 154 If the total utilization U=sum(WCET_i/P_i) is larger than M (with M equal
145 to the number of CPUs), then the scheduler is unable to respect all the 155 to the number of CPUs), then the scheduler is unable to respect all the
146 deadlines. 156 deadlines.
147 Note that total utilisation is defined as the sum of the utilisations 157 Note that total utilization is defined as the sum of the utilizations
148 WCET_i/P_i over all the real-time tasks in the system. When considering 158 WCET_i/P_i over all the real-time tasks in the system. When considering
149 multiple real-time tasks, the parameters of the i-th task are indicated 159 multiple real-time tasks, the parameters of the i-th task are indicated
150 with the "_i" suffix. 160 with the "_i" suffix.
151 Moreover, if the total utilisation is larger than M, then we risk starving 161 Moreover, if the total utilization is larger than M, then we risk starving
152 non- real-time tasks by real-time tasks. 162 non- real-time tasks by real-time tasks.
153 If, instead, the total utilisation is smaller than M, then non real-time 163 If, instead, the total utilization is smaller than M, then non real-time
154 tasks will not be starved and the system might be able to respect all the 164 tasks will not be starved and the system might be able to respect all the
155 deadlines. 165 deadlines.
156 As a matter of fact, in this case it is possible to provide an upper bound 166 As a matter of fact, in this case it is possible to provide an upper bound
@@ -159,38 +169,119 @@ CONTENTS
159 More precisely, it can be proven that using a global EDF scheduler the 169 More precisely, it can be proven that using a global EDF scheduler the
160 maximum tardiness of each task is smaller or equal than 170 maximum tardiness of each task is smaller or equal than
161 ((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max 171 ((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max
162 where WCET_max = max_i{WCET_i} is the maximum WCET, WCET_min=min_i{WCET_i} 172 where WCET_max = max{WCET_i} is the maximum WCET, WCET_min=min{WCET_i}
163 is the minimum WCET, and U_max = max_i{WCET_i/P_i} is the maximum utilisation. 173 is the minimum WCET, and U_max = max{WCET_i/P_i} is the maximum
174 utilization[12].
175
1763.2 Schedulability Analysis for Uniprocessor Systems
177------------------------
164 178
165 If M=1 (uniprocessor system), or in case of partitioned scheduling (each 179 If M=1 (uniprocessor system), or in case of partitioned scheduling (each
166 real-time task is statically assigned to one and only one CPU), it is 180 real-time task is statically assigned to one and only one CPU), it is
167 possible to formally check if all the deadlines are respected. 181 possible to formally check if all the deadlines are respected.
168 If D_i = P_i for all tasks, then EDF is able to respect all the deadlines 182 If D_i = P_i for all tasks, then EDF is able to respect all the deadlines
169 of all the tasks executing on a CPU if and only if the total utilisation 183 of all the tasks executing on a CPU if and only if the total utilization
170 of the tasks running on such a CPU is smaller or equal than 1. 184 of the tasks running on such a CPU is smaller or equal than 1.
171 If D_i != P_i for some task, then it is possible to define the density of 185 If D_i != P_i for some task, then it is possible to define the density of
172 a task as C_i/min{D_i,T_i}, and EDF is able to respect all the deadlines 186 a task as WCET_i/min{D_i,P_i}, and EDF is able to respect all the deadlines
173 of all the tasks running on a CPU if the sum sum_i C_i/min{D_i,T_i} of the 187 of all the tasks running on a CPU if the sum of the densities of the tasks
174 densities of the tasks running on such a CPU is smaller or equal than 1 188 running on such a CPU is smaller or equal than 1:
175 (notice that this condition is only sufficient, and not necessary). 189 sum(WCET_i / min{D_i, P_i}) <= 1
190 It is important to notice that this condition is only sufficient, and not
191 necessary: there are task sets that are schedulable, but do not respect the
192 condition. For example, consider the task set {Task_1,Task_2} composed by
193 Task_1=(50ms,50ms,100ms) and Task_2=(10ms,100ms,100ms).
194 EDF is clearly able to schedule the two tasks without missing any deadline
195 (Task_1 is scheduled as soon as it is released, and finishes just in time
196 to respect its deadline; Task_2 is scheduled immediately after Task_1, hence
197 its response time cannot be larger than 50ms + 10ms = 60ms) even if
198 50 / min{50,100} + 10 / min{100, 100} = 50 / 50 + 10 / 100 = 1.1
199 Of course it is possible to test the exact schedulability of tasks with
200 D_i != P_i (checking a condition that is both sufficient and necessary),
201 but this cannot be done by comparing the total utilization or density with
202 a constant. Instead, the so called "processor demand" approach can be used,
203 computing the total amount of CPU time h(t) needed by all the tasks to
204 respect all of their deadlines in a time interval of size t, and comparing
205 such a time with the interval size t. If h(t) is smaller than t (that is,
206 the amount of time needed by the tasks in a time interval of size t is
207 smaller than the size of the interval) for all the possible values of t, then
208 EDF is able to schedule the tasks respecting all of their deadlines. Since
209 performing this check for all possible values of t is impossible, it has been
210 proven[4,5,6] that it is sufficient to perform the test for values of t
211 between 0 and a maximum value L. The cited papers contain all of the
212 mathematical details and explain how to compute h(t) and L.
213 In any case, this kind of analysis is too complex as well as too
214 time-consuming to be performed on-line. Hence, as explained in Section
215 4 Linux uses an admission test based on the tasks' utilizations.
216
2173.3 Schedulability Analysis for Multiprocessor Systems
218------------------------
176 219
177 On multiprocessor systems with global EDF scheduling (non partitioned 220 On multiprocessor systems with global EDF scheduling (non partitioned
178 systems), a sufficient test for schedulability can not be based on the 221 systems), a sufficient test for schedulability can not be based on the
179 utilisations (it can be shown that task sets with utilisations slightly 222 utilizations or densities: it can be shown that even if D_i = P_i task
180 larger than 1 can miss deadlines regardless of the number of CPUs M). 223 sets with utilizations slightly larger than 1 can miss deadlines regardless
181 However, as previously stated, enforcing that the total utilisation is smaller 224 of the number of CPUs.
182 than M is enough to guarantee that non real-time tasks are not starved and 225
183 that the tardiness of real-time tasks has an upper bound. 226 Consider a set {Task_1,...Task_{M+1}} of M+1 tasks on a system with M
227 CPUs, with the first task Task_1=(P,P,P) having period, relative deadline
228 and WCET equal to P. The remaining M tasks Task_i=(e,P-1,P-1) have an
229 arbitrarily small worst case execution time (indicated as "e" here) and a
230 period smaller than the one of the first task. Hence, if all the tasks
231 activate at the same time t, global EDF schedules these M tasks first
232 (because their absolute deadlines are equal to t + P - 1, hence they are
233 smaller than the absolute deadline of Task_1, which is t + P). As a
234 result, Task_1 can be scheduled only at time t + e, and will finish at
235 time t + e + P, after its absolute deadline. The total utilization of the
236 task set is U = M · e / (P - 1) + P / P = M · e / (P - 1) + 1, and for small
237 values of e this can become very close to 1. This is known as "Dhall's
238 effect"[7]. Note: the example in the original paper by Dhall has been
239 slightly simplified here (for example, Dhall more correctly computed
240 lim_{e->0}U).
241
242 More complex schedulability tests for global EDF have been developed in
243 real-time literature[8,9], but they are not based on a simple comparison
244 between total utilization (or density) and a fixed constant. If all tasks
245 have D_i = P_i, a sufficient schedulability condition can be expressed in
246 a simple way:
247 sum(WCET_i / P_i) <= M - (M - 1) · U_max
248 where U_max = max{WCET_i / P_i}[10]. Notice that for U_max = 1,
249 M - (M - 1) · U_max becomes M - M + 1 = 1 and this schedulability condition
250 just confirms the Dhall's effect. A more complete survey of the literature
251 about schedulability tests for multi-processor real-time scheduling can be
252 found in [11].
253
254 As seen, enforcing that the total utilization is smaller than M does not
255 guarantee that global EDF schedules the tasks without missing any deadline
256 (in other words, global EDF is not an optimal scheduling algorithm). However,
257 a total utilization smaller than M is enough to guarantee that non real-time
258 tasks are not starved and that the tardiness of real-time tasks has an upper
259 bound[12] (as previously noted). Different bounds on the maximum tardiness
260 experienced by real-time tasks have been developed in various papers[13,14],
261 but the theoretical result that is important for SCHED_DEADLINE is that if
262 the total utilization is smaller or equal than M then the response times of
263 the tasks are limited.
264
2653.4 Relationship with SCHED_DEADLINE Parameters
266------------------------
184 267
185 SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that 268 Finally, it is important to understand the relationship between the
186 the jobs' deadlines of a task are respected. In order to do this, a task 269 SCHED_DEADLINE scheduling parameters described in Section 2 (runtime,
187 must be scheduled by setting: 270 deadline and period) and the real-time task parameters (WCET, D, P)
271 described in this section. Note that the tasks' temporal constraints are
272 represented by its absolute deadlines d_j = r_j + D described above, while
273 SCHED_DEADLINE schedules the tasks according to scheduling deadlines (see
274 Section 2).
275 If an admission test is used to guarantee that the scheduling deadlines
276 are respected, then SCHED_DEADLINE can be used to schedule real-time tasks
277 guaranteeing that all the jobs' deadlines of a task are respected.
278 In order to do this, a task must be scheduled by setting:
188 279
189 - runtime >= WCET 280 - runtime >= WCET
190 - deadline = D 281 - deadline = D
191 - period <= P 282 - period <= P
192 283
193 IOW, if runtime >= WCET and if period is >= P, then the scheduling deadlines 284 IOW, if runtime >= WCET and if period is <= P, then the scheduling deadlines
194 and the absolute deadlines (d_j) coincide, so a proper admission control 285 and the absolute deadlines (d_j) coincide, so a proper admission control
195 allows to respect the jobs' absolute deadlines for this task (this is what is 286 allows to respect the jobs' absolute deadlines for this task (this is what is
196 called "hard schedulability property" and is an extension of Lemma 1 of [2]). 287 called "hard schedulability property" and is an extension of Lemma 1 of [2]).
@@ -206,6 +297,39 @@ CONTENTS
206 Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf 297 Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf
207 3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab 298 3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab
208 Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf 299 Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf
300 4 - J. Y. Leung and M.L. Merril. A Note on Preemptive Scheduling of
301 Periodic, Real-Time Tasks. Information Processing Letters, vol. 11,
302 no. 3, pp. 115-118, 1980.
303 5 - S. K. Baruah, A. K. Mok and L. E. Rosier. Preemptively Scheduling
304 Hard-Real-Time Sporadic Tasks on One Processor. Proceedings of the
305 11th IEEE Real-time Systems Symposium, 1990.
306 6 - S. K. Baruah, L. E. Rosier and R. R. Howell. Algorithms and Complexity
307 Concerning the Preemptive Scheduling of Periodic Real-Time tasks on
308 One Processor. Real-Time Systems Journal, vol. 4, no. 2, pp 301-324,
309 1990.
310 7 - S. J. Dhall and C. L. Liu. On a real-time scheduling problem. Operations
311 research, vol. 26, no. 1, pp 127-140, 1978.
312 8 - T. Baker. Multiprocessor EDF and Deadline Monotonic Schedulability
313 Analysis. Proceedings of the 24th IEEE Real-Time Systems Symposium, 2003.
314 9 - T. Baker. An Analysis of EDF Schedulability on a Multiprocessor.
315 IEEE Transactions on Parallel and Distributed Systems, vol. 16, no. 8,
316 pp 760-768, 2005.
317 10 - J. Goossens, S. Funk and S. Baruah, Priority-Driven Scheduling of
318 Periodic Task Systems on Multiprocessors. Real-Time Systems Journal,
319 vol. 25, no. 2–3, pp. 187–205, 2003.
320 11 - R. Davis and A. Burns. A Survey of Hard Real-Time Scheduling for
321 Multiprocessor Systems. ACM Computing Surveys, vol. 43, no. 4, 2011.
322 http://www-users.cs.york.ac.uk/~robdavis/papers/MPSurveyv5.0.pdf
323 12 - U. C. Devi and J. H. Anderson. Tardiness Bounds under Global EDF
324 Scheduling on a Multiprocessor. Real-Time Systems Journal, vol. 32,
325 no. 2, pp 133-189, 2008.
326 13 - P. Valente and G. Lipari. An Upper Bound to the Lateness of Soft
327 Real-Time Tasks Scheduled by EDF on Multiprocessors. Proceedings of
328 the 26th IEEE Real-Time Systems Symposium, 2005.
329 14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
330 Global EDF. Proceedings of the 22nd Euromicro Conference on
331 Real-Time Systems, 2010.
332
209 333
2104. Bandwidth management 3344. Bandwidth management
211======================= 335=======================
@@ -218,10 +342,10 @@ CONTENTS
218 no guarantee can be given on the actual scheduling of the -deadline tasks. 342 no guarantee can be given on the actual scheduling of the -deadline tasks.
219 343
220 As already stated in Section 3, a necessary condition to be respected to 344 As already stated in Section 3, a necessary condition to be respected to
221 correctly schedule a set of real-time tasks is that the total utilisation 345 correctly schedule a set of real-time tasks is that the total utilization
222 is smaller than M. When talking about -deadline tasks, this requires that 346 is smaller than M. When talking about -deadline tasks, this requires that
223 the sum of the ratio between runtime and period for all tasks is smaller 347 the sum of the ratio between runtime and period for all tasks is smaller
224 than M. Notice that the ratio runtime/period is equivalent to the utilisation 348 than M. Notice that the ratio runtime/period is equivalent to the utilization
225 of a "traditional" real-time task, and is also often referred to as 349 of a "traditional" real-time task, and is also often referred to as
226 "bandwidth". 350 "bandwidth".
227 The interface used to control the CPU bandwidth that can be allocated 351 The interface used to control the CPU bandwidth that can be allocated
@@ -251,7 +375,7 @@ CONTENTS
251 The system wide settings are configured under the /proc virtual file system. 375 The system wide settings are configured under the /proc virtual file system.
252 376
253 For now the -rt knobs are used for -deadline admission control and the 377 For now the -rt knobs are used for -deadline admission control and the
254 -deadline runtime is accounted against the -rt runtime. We realise that this 378 -deadline runtime is accounted against the -rt runtime. We realize that this
255 isn't entirely desirable; however, it is better to have a small interface for 379 isn't entirely desirable; however, it is better to have a small interface for
256 now, and be able to change it easily later. The ideal situation (see 5.) is to 380 now, and be able to change it easily later. The ideal situation (see 5.) is to
257 run -rt tasks from a -deadline server; in which case the -rt bandwidth is a 381 run -rt tasks from a -deadline server; in which case the -rt bandwidth is a
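
Aside (not part of the patch): the "Relationship with SCHED_DEADLINE Parameters" text above asks for runtime >= WCET, deadline = D and period <= P, and Section 4 notes that admission control compares the sum of runtime/period against M. A minimal userspace sketch of setting such parameters through the sched_setattr() syscall; the 10ms/30ms/100ms values are invented for illustration and the struct layout follows the sched_setattr(2) man page.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define SCHED_DEADLINE	6

struct sched_attr {			/* local copy, as in the man page example */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* all three in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  = 10ULL * 1000 * 1000;	/* runtime  >= WCET (10 ms) */
	attr.sched_deadline = 30ULL * 1000 * 1000;	/* deadline  = D    (30 ms) */
	attr.sched_period   = 100ULL * 1000 * 1000;	/* period   <= P   (100 ms) */

	/* No glibc wrapper for sched_setattr(), so invoke the syscall directly;
	 * admission control may reject the request if the total utilization
	 * (sum of runtime/period) would exceed the configured bound. */
	if (syscall(SYS_sched_setattr, 0, &attr, 0) < 0) {
		perror("sched_setattr");
		return 1;
	}

	/* ... periodic real-time work would run here ... */
	return 0;
}
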
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 9d0ac091a52a..4a905bd667e2 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -23,8 +23,7 @@
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/module.h> 25#include <linux/module.h>
26 26#include <linux/uaccess.h>
27#include <asm/uaccess.h>
28 27
29extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *); 28extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *);
30 29
@@ -107,7 +106,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
107 106
108 /* If we're in an interrupt context, or have no user context, 107 /* If we're in an interrupt context, or have no user context,
109 we must not take the fault. */ 108 we must not take the fault. */
110 if (!mm || in_atomic()) 109 if (!mm || faulthandler_disabled())
111 goto no_context; 110 goto no_context;
112 111
113#ifdef CONFIG_ALPHA_LARGE_VMALLOC 112#ifdef CONFIG_ALPHA_LARGE_VMALLOC
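
Aside (not part of the patch): the fault-handler hunks in this series consistently replace the old in_atomic() check with faulthandler_disabled(). The helper itself is added to include/linux/uaccess.h in a hunk outside this excerpt; roughly, it is expected to combine the new per-task pagefault-disable state with the existing atomic-context check, as in the sketch below (the _sketch name is mine, only pagefault_disabled() and in_atomic() are existing helpers).

#include <linux/preempt.h>
#include <linux/uaccess.h>

/* Sketch only: a fault handler must bail out to the fixup path when either
 * pagefaults were explicitly disabled via pagefault_disable() or the task
 * is in atomic context for some other reason. */
static inline bool faulthandler_disabled_sketch(void)
{
	return pagefault_disabled() || in_atomic();
}
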
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index 4dc64ddebece..05b5aaf5b0f9 100644
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -53,7 +53,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
53 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 53 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
54 return -EFAULT; 54 return -EFAULT;
55 55
56 pagefault_disable(); /* implies preempt_disable() */ 56 pagefault_disable();
57 57
58 switch (op) { 58 switch (op) {
59 case FUTEX_OP_SET: 59 case FUTEX_OP_SET:
@@ -75,7 +75,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
75 ret = -ENOSYS; 75 ret = -ENOSYS;
76 } 76 }
77 77
78 pagefault_enable(); /* subsumes preempt_enable() */ 78 pagefault_enable();
79 79
80 if (!ret) { 80 if (!ret) {
81 switch (cmp) { 81 switch (cmp) {
@@ -104,7 +104,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
104 return ret; 104 return ret;
105} 105}
106 106
107/* Compare-xchg with preemption disabled. 107/* Compare-xchg with pagefaults disabled.
108 * Notes: 108 * Notes:
109 * -Best-Effort: Exchg happens only if compare succeeds. 109 * -Best-Effort: Exchg happens only if compare succeeds.
110 * If compare fails, returns; leaving retry/looping to upper layers 110 * If compare fails, returns; leaving retry/looping to upper layers
@@ -121,7 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
121 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 121 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
122 return -EFAULT; 122 return -EFAULT;
123 123
124 pagefault_disable(); /* implies preempt_disable() */ 124 pagefault_disable();
125 125
126 /* TBD : can use llock/scond */ 126 /* TBD : can use llock/scond */
127 __asm__ __volatile__( 127 __asm__ __volatile__(
@@ -142,7 +142,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
142 : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) 142 : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT)
143 : "cc", "memory"); 143 : "cc", "memory");
144 144
145 pagefault_enable(); /* subsumes preempt_enable() */ 145 pagefault_enable();
146 146
147 *uval = val; 147 *uval = val;
148 return val; 148 return val;
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 6a2e006cbcce..d948e4e9d89c 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -86,7 +86,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
86 * If we're in an interrupt or have no user 86 * If we're in an interrupt or have no user
87 * context, we must not take the fault.. 87 * context, we must not take the fault..
88 */ 88 */
89 if (in_atomic() || !mm) 89 if (faulthandler_disabled() || !mm)
90 goto no_context; 90 goto no_context;
91 91
92 if (user_mode(regs)) 92 if (user_mode(regs))
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 4e78065a16aa..5eed82809d82 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -93,6 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
93 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) 93 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
94 return -EFAULT; 94 return -EFAULT;
95 95
96 preempt_disable();
96 __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" 97 __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
97 "1: " TUSER(ldr) " %1, [%4]\n" 98 "1: " TUSER(ldr) " %1, [%4]\n"
98 " teq %1, %2\n" 99 " teq %1, %2\n"
@@ -104,6 +105,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
104 : "cc", "memory"); 105 : "cc", "memory");
105 106
106 *uval = val; 107 *uval = val;
108 preempt_enable();
109
107 return ret; 110 return ret;
108} 111}
109 112
@@ -124,7 +127,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
124 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) 127 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
125 return -EFAULT; 128 return -EFAULT;
126 129
127 pagefault_disable(); /* implies preempt_disable() */ 130#ifndef CONFIG_SMP
131 preempt_disable();
132#endif
133 pagefault_disable();
128 134
129 switch (op) { 135 switch (op) {
130 case FUTEX_OP_SET: 136 case FUTEX_OP_SET:
@@ -146,7 +152,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
146 ret = -ENOSYS; 152 ret = -ENOSYS;
147 } 153 }
148 154
149 pagefault_enable(); /* subsumes preempt_enable() */ 155 pagefault_enable();
156#ifndef CONFIG_SMP
157 preempt_enable();
158#endif
150 159
151 if (!ret) { 160 if (!ret) {
152 switch (cmp) { 161 switch (cmp) {
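
Aside (not part of the patch): the hunk above illustrates the decoupling mentioned in the pull message: pagefault_disable() no longer implies preempt_disable(), so code that also depended on non-preemptibility (here the !CONFIG_SMP futex path, which has no exclusive accessors) now has to disable preemption explicitly. A condensed sketch of the resulting pattern, with the actual user access elided:

#include <linux/preempt.h>
#include <linux/uaccess.h>

/* Sketch only: after this series, pagefault_disable() merely marks user
 * accesses as non-sleeping (they return -EFAULT instead of faulting in);
 * any atomicity requirement has to be stated separately. */
static void futex_style_atomic_op(void)
{
#ifndef CONFIG_SMP
	preempt_disable();		/* UP atomicity, as in the ARM hunk above */
#endif
	pagefault_disable();

	/* ... inline-asm user access, e.g. the futex cmpxchg ... */

	pagefault_enable();
#ifndef CONFIG_SMP
	preempt_enable();
#endif
}
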
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 2fe85fff5cca..370f7a732900 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
18#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id) 18#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id)
19#define topology_core_id(cpu) (cpu_topology[cpu].core_id) 19#define topology_core_id(cpu) (cpu_topology[cpu].core_id)
20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) 20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling)
21#define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) 21#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling)
22 22
23void init_cpu_topology(void); 23void init_cpu_topology(void);
24void store_cpu_topology(unsigned int cpuid); 24void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 6333d9c17875..0d629b8f973f 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -276,7 +276,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
276 * If we're in an interrupt or have no user 276 * If we're in an interrupt or have no user
277 * context, we must not take the fault.. 277 * context, we must not take the fault..
278 */ 278 */
279 if (in_atomic() || !mm) 279 if (faulthandler_disabled() || !mm)
280 goto no_context; 280 goto no_context;
281 281
282 if (user_mode(regs)) 282 if (user_mode(regs))
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index b98895d9fe57..ee8dfa793989 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -59,6 +59,7 @@ void *kmap_atomic(struct page *page)
59 void *kmap; 59 void *kmap;
60 int type; 60 int type;
61 61
62 preempt_disable();
62 pagefault_disable(); 63 pagefault_disable();
63 if (!PageHighMem(page)) 64 if (!PageHighMem(page))
64 return page_address(page); 65 return page_address(page);
@@ -121,6 +122,7 @@ void __kunmap_atomic(void *kvaddr)
121 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); 122 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
122 } 123 }
123 pagefault_enable(); 124 pagefault_enable();
125 preempt_enable();
124} 126}
125EXPORT_SYMBOL(__kunmap_atomic); 127EXPORT_SYMBOL(__kunmap_atomic);
126 128
@@ -130,6 +132,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
130 int idx, type; 132 int idx, type;
131 struct page *page = pfn_to_page(pfn); 133 struct page *page = pfn_to_page(pfn);
132 134
135 preempt_disable();
133 pagefault_disable(); 136 pagefault_disable();
134 if (!PageHighMem(page)) 137 if (!PageHighMem(page))
135 return page_address(page); 138 return page_address(page);
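
Aside (not part of the patch): kmap_atomic() used to inherit non-preemptibility from pagefault_disable(); with the decoupling it now calls preempt_disable() itself, so existing callers keep their guarantees. A minimal sketch of the unchanged caller-side pattern (the function and buffer names are invented):

#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/types.h>

/* Sketch only: copy data out of a (possibly highmem) page. The mapping slot
 * is per-CPU, which is why kmap_atomic() must keep running non-preemptibly
 * even though pagefault_disable() no longer disables preemption. */
static void copy_from_page_sketch(struct page *page, void *buf, size_t len)
{
	void *vaddr = kmap_atomic(page);

	memcpy(buf, vaddr, len);
	kunmap_atomic(vaddr);
}
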
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 5f750dc96e0f..74069b3bd919 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -58,7 +58,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
58 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) 58 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
59 return -EFAULT; 59 return -EFAULT;
60 60
61 pagefault_disable(); /* implies preempt_disable() */ 61 pagefault_disable();
62 62
63 switch (op) { 63 switch (op) {
64 case FUTEX_OP_SET: 64 case FUTEX_OP_SET:
@@ -85,7 +85,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
85 ret = -ENOSYS; 85 ret = -ENOSYS;
86 } 86 }
87 87
88 pagefault_enable(); /* subsumes preempt_enable() */ 88 pagefault_enable();
89 89
90 if (!ret) { 90 if (!ret) {
91 switch (cmp) { 91 switch (cmp) {
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 7ebcd31ce51c..225ec3524fbf 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
18#define topology_physical_package_id(cpu) (cpu_topology[cpu].cluster_id) 18#define topology_physical_package_id(cpu) (cpu_topology[cpu].cluster_id)
19#define topology_core_id(cpu) (cpu_topology[cpu].core_id) 19#define topology_core_id(cpu) (cpu_topology[cpu].core_id)
20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) 20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling)
21#define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) 21#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling)
22 22
23void init_cpu_topology(void); 23void init_cpu_topology(void);
24void store_cpu_topology(unsigned int cpuid); 24void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 96da13167d4a..0948d327d013 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -211,7 +211,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
211 * If we're in an interrupt or have no user context, we must not take 211 * If we're in an interrupt or have no user context, we must not take
212 * the fault. 212 * the fault.
213 */ 213 */
214 if (in_atomic() || !mm) 214 if (faulthandler_disabled() || !mm)
215 goto no_context; 215 goto no_context;
216 216
217 if (user_mode(regs)) 217 if (user_mode(regs))
diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h
index a46f7cf3e1ea..68cf638faf48 100644
--- a/arch/avr32/include/asm/uaccess.h
+++ b/arch/avr32/include/asm/uaccess.h
@@ -97,7 +97,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
97 * @x: Value to copy to user space. 97 * @x: Value to copy to user space.
98 * @ptr: Destination address, in user space. 98 * @ptr: Destination address, in user space.
99 * 99 *
100 * Context: User context only. This function may sleep. 100 * Context: User context only. This function may sleep if pagefaults are
101 * enabled.
101 * 102 *
102 * This macro copies a single simple value from kernel space to user 103 * This macro copies a single simple value from kernel space to user
103 * space. It supports simple types like char and int, but not larger 104 * space. It supports simple types like char and int, but not larger
@@ -116,7 +117,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
116 * @x: Variable to store result. 117 * @x: Variable to store result.
117 * @ptr: Source address, in user space. 118 * @ptr: Source address, in user space.
118 * 119 *
119 * Context: User context only. This function may sleep. 120 * Context: User context only. This function may sleep if pagefaults are
121 * enabled.
120 * 122 *
121 * This macro copies a single simple variable from user space to kernel 123 * This macro copies a single simple variable from user space to kernel
122 * space. It supports simple types like char and int, but not larger 124 * space. It supports simple types like char and int, but not larger
@@ -136,7 +138,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
136 * @x: Value to copy to user space. 138 * @x: Value to copy to user space.
137 * @ptr: Destination address, in user space. 139 * @ptr: Destination address, in user space.
138 * 140 *
139 * Context: User context only. This function may sleep. 141 * Context: User context only. This function may sleep if pagefaults are
142 * enabled.
140 * 143 *
141 * This macro copies a single simple value from kernel space to user 144 * This macro copies a single simple value from kernel space to user
142 * space. It supports simple types like char and int, but not larger 145 * space. It supports simple types like char and int, but not larger
@@ -158,7 +161,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
158 * @x: Variable to store result. 161 * @x: Variable to store result.
159 * @ptr: Source address, in user space. 162 * @ptr: Source address, in user space.
160 * 163 *
161 * Context: User context only. This function may sleep. 164 * Context: User context only. This function may sleep if pagefaults are
165 * enabled.
162 * 166 *
163 * This macro copies a single simple variable from user space to kernel 167 * This macro copies a single simple variable from user space to kernel
164 * space. It supports simple types like char and int, but not larger 168 * space. It supports simple types like char and int, but not larger
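
Aside (not part of the patch): the docbook comments above are reworded from "This function may sleep" to "may sleep if pagefaults are enabled", because under pagefault_disable() the same accessors no longer sleep but fail with -EFAULT instead. A small sketch of both calling modes, assuming an ordinary process-context caller (the function and variable names are invented):

#include <linux/uaccess.h>

/* Sketch only: the same user access behaves differently depending on
 * whether pagefaults are enabled in the calling context. */
static int read_user_int(int __user *uptr, int *out)
{
	int err;

	/* Process context, pagefaults enabled: may sleep to fault the page in. */
	err = get_user(*out, uptr);
	if (err)
		return err;

	/* Atomic-style access: never sleeps, -EFAULT if the page is not present. */
	pagefault_disable();
	err = __get_user(*out, uptr);
	pagefault_enable();

	return err;
}
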
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index d223a8b57c1e..c03533937a9f 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -14,11 +14,11 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/kdebug.h> 15#include <linux/kdebug.h>
16#include <linux/kprobes.h> 16#include <linux/kprobes.h>
17#include <linux/uaccess.h>
17 18
18#include <asm/mmu_context.h> 19#include <asm/mmu_context.h>
19#include <asm/sysreg.h> 20#include <asm/sysreg.h>
20#include <asm/tlb.h> 21#include <asm/tlb.h>
21#include <asm/uaccess.h>
22 22
23#ifdef CONFIG_KPROBES 23#ifdef CONFIG_KPROBES
24static inline int notify_page_fault(struct pt_regs *regs, int trap) 24static inline int notify_page_fault(struct pt_regs *regs, int trap)
@@ -81,7 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
81 * If we're in an interrupt or have no user context, we must 81 * If we're in an interrupt or have no user context, we must
82 * not take the fault... 82 * not take the fault...
83 */ 83 */
84 if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM)) 84 if (faulthandler_disabled() || !mm || regs->sr & SYSREG_BIT(GM))
85 goto no_context; 85 goto no_context;
86 86
87 local_irq_enable(); 87 local_irq_enable();
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 83f12f2ed9e3..3066d40a6db1 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -8,7 +8,7 @@
8#include <linux/interrupt.h> 8#include <linux/interrupt.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <asm/uaccess.h> 11#include <linux/uaccess.h>
12#include <arch/system.h> 12#include <arch/system.h>
13 13
14extern int find_fixup_code(struct pt_regs *); 14extern int find_fixup_code(struct pt_regs *);
@@ -109,11 +109,11 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
109 info.si_code = SEGV_MAPERR; 109 info.si_code = SEGV_MAPERR;
110 110
111 /* 111 /*
112 * If we're in an interrupt or "atomic" operation or have no 112 * If we're in an interrupt, have pagefaults disabled or have no
113 * user context, we must not take the fault. 113 * user context, we must not take the fault.
114 */ 114 */
115 115
116 if (in_atomic() || !mm) 116 if (faulthandler_disabled() || !mm)
117 goto no_context; 117 goto no_context;
118 118
119 if (user_mode(regs)) 119 if (user_mode(regs))
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
index ec4917ddf678..61d99767fe16 100644
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -19,9 +19,9 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/ptrace.h> 20#include <linux/ptrace.h>
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/uaccess.h>
22 23
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/uaccess.h>
25#include <asm/gdb-stub.h> 25#include <asm/gdb-stub.h>
26 26
27/*****************************************************************************/ 27/*****************************************************************************/
@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
78 * If we're in an interrupt or have no user 78 * If we're in an interrupt or have no user
79 * context, we must not take the fault.. 79 * context, we must not take the fault..
80 */ 80 */
81 if (in_atomic() || !mm) 81 if (faulthandler_disabled() || !mm)
82 goto no_context; 82 goto no_context;
83 83
84 if (user_mode(__frame)) 84 if (user_mode(__frame))
diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c
index bed9a9bd3c10..785344bbdc07 100644
--- a/arch/frv/mm/highmem.c
+++ b/arch/frv/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
42 unsigned long paddr; 42 unsigned long paddr;
43 int type; 43 int type;
44 44
45 preempt_disable();
45 pagefault_disable(); 46 pagefault_disable();
46 type = kmap_atomic_idx_push(); 47 type = kmap_atomic_idx_push();
47 paddr = page_to_phys(page); 48 paddr = page_to_phys(page);
@@ -85,5 +86,6 @@ void __kunmap_atomic(void *kvaddr)
85 } 86 }
86 kmap_atomic_idx_pop(); 87 kmap_atomic_idx_pop();
87 pagefault_enable(); 88 pagefault_enable();
89 preempt_enable();
88} 90}
89EXPORT_SYMBOL(__kunmap_atomic); 91EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h
index e4127e4d6a5b..f000a382bc7f 100644
--- a/arch/hexagon/include/asm/uaccess.h
+++ b/arch/hexagon/include/asm/uaccess.h
@@ -36,7 +36,8 @@
36 * @addr: User space pointer to start of block to check 36 * @addr: User space pointer to start of block to check
37 * @size: Size of block to check 37 * @size: Size of block to check
38 * 38 *
39 * Context: User context only. This function may sleep. 39 * Context: User context only. This function may sleep if pagefaults are
40 * enabled.
40 * 41 *
41 * Checks if a pointer to a block of memory in user space is valid. 42 * Checks if a pointer to a block of memory in user space is valid.
42 * 43 *
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 6437ca21f61b..3ad8f6988363 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -53,7 +53,7 @@ void build_cpu_to_node_map(void);
53#define topology_physical_package_id(cpu) (cpu_data(cpu)->socket_id) 53#define topology_physical_package_id(cpu) (cpu_data(cpu)->socket_id)
54#define topology_core_id(cpu) (cpu_data(cpu)->core_id) 54#define topology_core_id(cpu) (cpu_data(cpu)->core_id)
55#define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) 55#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
56#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) 56#define topology_sibling_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
57#endif 57#endif
58 58
59extern void arch_fix_phys_package_id(int num, u32 slot); 59extern void arch_fix_phys_package_id(int num, u32 slot);
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index ba5ba7accd0d..70b40d1205a6 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -11,10 +11,10 @@
11#include <linux/kprobes.h> 11#include <linux/kprobes.h>
12#include <linux/kdebug.h> 12#include <linux/kdebug.h>
13#include <linux/prefetch.h> 13#include <linux/prefetch.h>
14#include <linux/uaccess.h>
14 15
15#include <asm/pgtable.h> 16#include <asm/pgtable.h>
16#include <asm/processor.h> 17#include <asm/processor.h>
17#include <asm/uaccess.h>
18 18
19extern int die(char *, struct pt_regs *, long); 19extern int die(char *, struct pt_regs *, long);
20 20
@@ -96,7 +96,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
96 /* 96 /*
97 * If we're in an interrupt or have no user context, we must not take the fault.. 97 * If we're in an interrupt or have no user context, we must not take the fault..
98 */ 98 */
99 if (in_atomic() || !mm) 99 if (faulthandler_disabled() || !mm)
100 goto no_context; 100 goto no_context;
101 101
102#ifdef CONFIG_VIRTUAL_MEM_MAP 102#ifdef CONFIG_VIRTUAL_MEM_MAP
diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h
index 71adff209405..cac7014daef3 100644
--- a/arch/m32r/include/asm/uaccess.h
+++ b/arch/m32r/include/asm/uaccess.h
@@ -91,7 +91,8 @@ static inline void set_fs(mm_segment_t s)
91 * @addr: User space pointer to start of block to check 91 * @addr: User space pointer to start of block to check
92 * @size: Size of block to check 92 * @size: Size of block to check
93 * 93 *
94 * Context: User context only. This function may sleep. 94 * Context: User context only. This function may sleep if pagefaults are
95 * enabled.
95 * 96 *
96 * Checks if a pointer to a block of memory in user space is valid. 97 * Checks if a pointer to a block of memory in user space is valid.
97 * 98 *
@@ -155,7 +156,8 @@ extern int fixup_exception(struct pt_regs *regs);
155 * @x: Variable to store result. 156 * @x: Variable to store result.
156 * @ptr: Source address, in user space. 157 * @ptr: Source address, in user space.
157 * 158 *
158 * Context: User context only. This function may sleep. 159 * Context: User context only. This function may sleep if pagefaults are
160 * enabled.
159 * 161 *
160 * This macro copies a single simple variable from user space to kernel 162 * This macro copies a single simple variable from user space to kernel
161 * space. It supports simple types like char and int, but not larger 163 * space. It supports simple types like char and int, but not larger
@@ -175,7 +177,8 @@ extern int fixup_exception(struct pt_regs *regs);
175 * @x: Value to copy to user space. 177 * @x: Value to copy to user space.
176 * @ptr: Destination address, in user space. 178 * @ptr: Destination address, in user space.
177 * 179 *
178 * Context: User context only. This function may sleep. 180 * Context: User context only. This function may sleep if pagefaults are
181 * enabled.
179 * 182 *
180 * This macro copies a single simple value from kernel space to user 183 * This macro copies a single simple value from kernel space to user
181 * space. It supports simple types like char and int, but not larger 184 * space. It supports simple types like char and int, but not larger
@@ -194,7 +197,8 @@ extern int fixup_exception(struct pt_regs *regs);
194 * @x: Variable to store result. 197 * @x: Variable to store result.
195 * @ptr: Source address, in user space. 198 * @ptr: Source address, in user space.
196 * 199 *
197 * Context: User context only. This function may sleep. 200 * Context: User context only. This function may sleep if pagefaults are
201 * enabled.
198 * 202 *
199 * This macro copies a single simple variable from user space to kernel 203 * This macro copies a single simple variable from user space to kernel
200 * space. It supports simple types like char and int, but not larger 204 * space. It supports simple types like char and int, but not larger
@@ -274,7 +278,8 @@ do { \
274 * @x: Value to copy to user space. 278 * @x: Value to copy to user space.
275 * @ptr: Destination address, in user space. 279 * @ptr: Destination address, in user space.
276 * 280 *
277 * Context: User context only. This function may sleep. 281 * Context: User context only. This function may sleep if pagefaults are
282 * enabled.
278 * 283 *
279 * This macro copies a single simple value from kernel space to user 284 * This macro copies a single simple value from kernel space to user
280 * space. It supports simple types like char and int, but not larger 285 * space. It supports simple types like char and int, but not larger
@@ -568,7 +573,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
568 * @from: Source address, in kernel space. 573 * @from: Source address, in kernel space.
569 * @n: Number of bytes to copy. 574 * @n: Number of bytes to copy.
570 * 575 *
571 * Context: User context only. This function may sleep. 576 * Context: User context only. This function may sleep if pagefaults are
577 * enabled.
572 * 578 *
573 * Copy data from kernel space to user space. Caller must check 579 * Copy data from kernel space to user space. Caller must check
574 * the specified block with access_ok() before calling this function. 580 * the specified block with access_ok() before calling this function.
@@ -588,7 +594,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
588 * @from: Source address, in kernel space. 594 * @from: Source address, in kernel space.
589 * @n: Number of bytes to copy. 595 * @n: Number of bytes to copy.
590 * 596 *
591 * Context: User context only. This function may sleep. 597 * Context: User context only. This function may sleep if pagefaults are
598 * enabled.
592 * 599 *
593 * Copy data from kernel space to user space. 600 * Copy data from kernel space to user space.
594 * 601 *
@@ -606,7 +613,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
606 * @from: Source address, in user space. 613 * @from: Source address, in user space.
607 * @n: Number of bytes to copy. 614 * @n: Number of bytes to copy.
608 * 615 *
609 * Context: User context only. This function may sleep. 616 * Context: User context only. This function may sleep if pagefaults are
617 * enabled.
610 * 618 *
611 * Copy data from user space to kernel space. Caller must check 619 * Copy data from user space to kernel space. Caller must check
612 * the specified block with access_ok() before calling this function. 620 * the specified block with access_ok() before calling this function.
@@ -626,7 +634,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
626 * @from: Source address, in user space. 634 * @from: Source address, in user space.
627 * @n: Number of bytes to copy. 635 * @n: Number of bytes to copy.
628 * 636 *
629 * Context: User context only. This function may sleep. 637 * Context: User context only. This function may sleep if pagefaults are
638 * enabled.
630 * 639 *
631 * Copy data from user space to kernel space. 640 * Copy data from user space to kernel space.
632 * 641 *
@@ -677,7 +686,8 @@ unsigned long clear_user(void __user *mem, unsigned long len);
677 * strlen_user: - Get the size of a string in user space. 686 * strlen_user: - Get the size of a string in user space.
678 * @str: The string to measure. 687 * @str: The string to measure.
679 * 688 *
680 * Context: User context only. This function may sleep. 689 * Context: User context only. This function may sleep if pagefaults are
690 * enabled.
681 * 691 *
682 * Get the size of a NUL-terminated string in user space. 692 * Get the size of a NUL-terminated string in user space.
683 * 693 *
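The recurring kerneldoc update above ("may sleep if pagefaults are enabled") states the new contract: inside a pagefault_disable() section the user-access helpers do not sleep, they simply fail the access. A minimal sketch of how a caller is expected to cope with that (dst, usrc and len are hypothetical names, not taken from this diff):

	pagefault_disable();
	ret = __copy_from_user_inatomic(dst, usrc, len);	/* never sleeps */
	pagefault_enable();
	if (ret)
		return -EFAULT;	/* or retry via a path that is allowed to sleep */
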
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index e3d4d4890104..8f9875b7933d 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -24,9 +24,9 @@
24#include <linux/vt_kern.h> /* For unblank_screen() */ 24#include <linux/vt_kern.h> /* For unblank_screen() */
25#include <linux/highmem.h> 25#include <linux/highmem.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/uaccess.h>
27 28
28#include <asm/m32r.h> 29#include <asm/m32r.h>
29#include <asm/uaccess.h>
30#include <asm/hardirq.h> 30#include <asm/hardirq.h>
31#include <asm/mmu_context.h> 31#include <asm/mmu_context.h>
32#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
@@ -111,10 +111,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
111 mm = tsk->mm; 111 mm = tsk->mm;
112 112
113 /* 113 /*
114 * If we're in an interrupt or have no user context or are running in an 114 * If we're in an interrupt or have no user context or have pagefaults
115 * atomic region then we must not take the fault.. 115 * disabled then we must not take the fault.
116 */ 116 */
117 if (in_atomic() || !mm) 117 if (faulthandler_disabled() || !mm)
118 goto bad_area_nosemaphore; 118 goto bad_area_nosemaphore;
119 119
120 if (error_code & ACE_USERMODE) 120 if (error_code & ACE_USERMODE)
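The include churn above follows the same rule in every fault handler this series touches: <linux/uaccess.h> is where pagefault_disable()/pagefault_enable() and the new pagefault_disabled()/faulthandler_disabled() helpers live, and it pulls in the architecture header itself, so the direct asm include is dropped:

	#include <linux/uaccess.h>	/* provides the pagefault helpers and   */
					/* includes <asm/uaccess.h> on its own  */
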
diff --git a/arch/m68k/include/asm/irqflags.h b/arch/m68k/include/asm/irqflags.h
index a823cd73dc09..b5941818346f 100644
--- a/arch/m68k/include/asm/irqflags.h
+++ b/arch/m68k/include/asm/irqflags.h
@@ -2,9 +2,6 @@
2#define _M68K_IRQFLAGS_H 2#define _M68K_IRQFLAGS_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#ifdef CONFIG_MMU
6#include <linux/preempt_mask.h>
7#endif
8#include <linux/preempt.h> 5#include <linux/preempt.h>
9#include <asm/thread_info.h> 6#include <asm/thread_info.h>
10#include <asm/entry.h> 7#include <asm/entry.h>
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index b2f04aee46ec..6a94cdd0c830 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -10,10 +10,10 @@
10#include <linux/ptrace.h> 10#include <linux/ptrace.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/uaccess.h>
13 14
14#include <asm/setup.h> 15#include <asm/setup.h>
15#include <asm/traps.h> 16#include <asm/traps.h>
16#include <asm/uaccess.h>
17#include <asm/pgalloc.h> 17#include <asm/pgalloc.h>
18 18
19extern void die_if_kernel(char *, struct pt_regs *, long); 19extern void die_if_kernel(char *, struct pt_regs *, long);
@@ -81,7 +81,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
81 * If we're in an interrupt or have no user 81 * If we're in an interrupt or have no user
82 * context, we must not take the fault.. 82 * context, we must not take the fault..
83 */ 83 */
84 if (in_atomic() || !mm) 84 if (faulthandler_disabled() || !mm)
85 goto no_context; 85 goto no_context;
86 86
87 if (user_mode(regs)) 87 if (user_mode(regs))
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index 2de5dc695a87..f57edca63609 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -105,7 +105,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
105 105
106 mm = tsk->mm; 106 mm = tsk->mm;
107 107
108 if (in_atomic() || !mm) 108 if (faulthandler_disabled() || !mm)
109 goto no_context; 109 goto no_context;
110 110
111 if (user_mode(regs)) 111 if (user_mode(regs))
diff --git a/arch/metag/mm/highmem.c b/arch/metag/mm/highmem.c
index d71f621a2c0b..807f1b1c4e65 100644
--- a/arch/metag/mm/highmem.c
+++ b/arch/metag/mm/highmem.c
@@ -43,7 +43,7 @@ void *kmap_atomic(struct page *page)
43 unsigned long vaddr; 43 unsigned long vaddr;
44 int type; 44 int type;
45 45
46 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 46 preempt_disable();
47 pagefault_disable(); 47 pagefault_disable();
48 if (!PageHighMem(page)) 48 if (!PageHighMem(page))
49 return page_address(page); 49 return page_address(page);
@@ -82,6 +82,7 @@ void __kunmap_atomic(void *kvaddr)
82 } 82 }
83 83
84 pagefault_enable(); 84 pagefault_enable();
85 preempt_enable();
85} 86}
86EXPORT_SYMBOL(__kunmap_atomic); 87EXPORT_SYMBOL(__kunmap_atomic);
87 88
@@ -95,6 +96,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
95 unsigned long vaddr; 96 unsigned long vaddr;
96 int type; 97 int type;
97 98
99 preempt_disable();
98 pagefault_disable(); 100 pagefault_disable();
99 101
100 type = kmap_atomic_idx_push(); 102 type = kmap_atomic_idx_push();
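The highmem hunks above, and the analogous ones for microblaze, MIPS, mn10300, parisc, powerpc and sparc further down, all apply the same two-line pattern: since pagefault_disable() no longer raises the preempt count, kmap_atomic() must pin the task to its CPU explicitly, and __kunmap_atomic() unwinds in reverse order. Condensed sketch of the shape, with the arch-specific mapping code elided:

	void *kmap_atomic(struct page *page)
	{
		preempt_disable();	/* per-CPU fixmap slot: no migration  */
		pagefault_disable();	/* faults must not schedule here      */
		if (!PageHighMem(page))
			return page_address(page);
		/* ... map the page into a per-CPU fixmap slot ... */
	}

	void __kunmap_atomic(void *kvaddr)
	{
		/* ... tear down the fixmap mapping ... */
		pagefault_enable();
		preempt_enable();	/* strictly after pagefault_enable() */
	}
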
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 62942fd12672..331b0d35f89c 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -178,7 +178,8 @@ extern long __user_bad(void);
178 * @x: Variable to store result. 178 * @x: Variable to store result.
179 * @ptr: Source address, in user space. 179 * @ptr: Source address, in user space.
180 * 180 *
181 * Context: User context only. This function may sleep. 181 * Context: User context only. This function may sleep if pagefaults are
182 * enabled.
182 * 183 *
183 * This macro copies a single simple variable from user space to kernel 184 * This macro copies a single simple variable from user space to kernel
184 * space. It supports simple types like char and int, but not larger 185 * space. It supports simple types like char and int, but not larger
@@ -290,7 +291,8 @@ extern long __user_bad(void);
290 * @x: Value to copy to user space. 291 * @x: Value to copy to user space.
291 * @ptr: Destination address, in user space. 292 * @ptr: Destination address, in user space.
292 * 293 *
293 * Context: User context only. This function may sleep. 294 * Context: User context only. This function may sleep if pagefaults are
295 * enabled.
294 * 296 *
295 * This macro copies a single simple value from kernel space to user 297 * This macro copies a single simple value from kernel space to user
296 * space. It supports simple types like char and int, but not larger 298 * space. It supports simple types like char and int, but not larger
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index d46a5ebb7570..177dfc003643 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -107,14 +107,14 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
107 if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11) 107 if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11)
108 is_write = 0; 108 is_write = 0;
109 109
110 if (unlikely(in_atomic() || !mm)) { 110 if (unlikely(faulthandler_disabled() || !mm)) {
111 if (kernel_mode(regs)) 111 if (kernel_mode(regs))
112 goto bad_area_nosemaphore; 112 goto bad_area_nosemaphore;
113 113
114 /* in_atomic() in user mode is really bad, 114 /* faulthandler_disabled() in user mode is really bad,
115 as is current->mm == NULL. */ 115 as is current->mm == NULL. */
116 pr_emerg("Page fault in user mode with in_atomic(), mm = %p\n", 116 pr_emerg("Page fault in user mode with faulthandler_disabled(), mm = %p\n",
117 mm); 117 mm);
118 pr_emerg("r15 = %lx MSR = %lx\n", 118 pr_emerg("r15 = %lx MSR = %lx\n",
119 regs->r15, regs->msr); 119 regs->r15, regs->msr);
120 die("Weird page fault", regs, SIGSEGV); 120 die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c
index 5a92576fad92..2fcc5a52d84d 100644
--- a/arch/microblaze/mm/highmem.c
+++ b/arch/microblaze/mm/highmem.c
@@ -37,7 +37,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
37 unsigned long vaddr; 37 unsigned long vaddr;
38 int idx, type; 38 int idx, type;
39 39
40 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 40 preempt_disable();
41 pagefault_disable(); 41 pagefault_disable();
42 if (!PageHighMem(page)) 42 if (!PageHighMem(page))
43 return page_address(page); 43 return page_address(page);
@@ -63,6 +63,7 @@ void __kunmap_atomic(void *kvaddr)
63 63
64 if (vaddr < __fix_to_virt(FIX_KMAP_END)) { 64 if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
65 pagefault_enable(); 65 pagefault_enable();
66 preempt_enable();
66 return; 67 return;
67 } 68 }
68 69
@@ -84,5 +85,6 @@ void __kunmap_atomic(void *kvaddr)
84#endif 85#endif
85 kmap_atomic_idx_pop(); 86 kmap_atomic_idx_pop();
86 pagefault_enable(); 87 pagefault_enable();
88 preempt_enable();
87} 89}
88EXPORT_SYMBOL(__kunmap_atomic); 90EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h
index 3e307ec2afba..7afda4150a59 100644
--- a/arch/mips/include/asm/topology.h
+++ b/arch/mips/include/asm/topology.h
@@ -15,7 +15,7 @@
15#define topology_physical_package_id(cpu) (cpu_data[cpu].package) 15#define topology_physical_package_id(cpu) (cpu_data[cpu].package)
16#define topology_core_id(cpu) (cpu_data[cpu].core) 16#define topology_core_id(cpu) (cpu_data[cpu].core)
17#define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) 17#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
18#define topology_thread_cpumask(cpu) (&cpu_sibling_map[cpu]) 18#define topology_sibling_cpumask(cpu) (&cpu_sibling_map[cpu])
19#endif 19#endif
20 20
21#endif /* __ASM_TOPOLOGY_H */ 21#endif /* __ASM_TOPOLOGY_H */
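topology_thread_cpumask() is renamed topology_sibling_cpumask() here and in the other topology headers in this diff; the mask itself is unchanged and still describes the SMT siblings of a CPU. Illustrative use only (the pr_info() line is an example, not code from this series):

	int cpu, this_cpu = smp_processor_id();

	for_each_cpu(cpu, topology_sibling_cpumask(this_cpu))
		pr_info("CPU%d is an SMT sibling of CPU%d\n", cpu, this_cpu);
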
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index bf8b32450ef6..9722357d2854 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -103,7 +103,8 @@ extern u64 __ua_limit;
103 * @addr: User space pointer to start of block to check 103 * @addr: User space pointer to start of block to check
104 * @size: Size of block to check 104 * @size: Size of block to check
105 * 105 *
106 * Context: User context only. This function may sleep. 106 * Context: User context only. This function may sleep if pagefaults are
107 * enabled.
107 * 108 *
108 * Checks if a pointer to a block of memory in user space is valid. 109 * Checks if a pointer to a block of memory in user space is valid.
109 * 110 *
@@ -138,7 +139,8 @@ extern u64 __ua_limit;
138 * @x: Value to copy to user space. 139 * @x: Value to copy to user space.
139 * @ptr: Destination address, in user space. 140 * @ptr: Destination address, in user space.
140 * 141 *
141 * Context: User context only. This function may sleep. 142 * Context: User context only. This function may sleep if pagefaults are
143 * enabled.
142 * 144 *
143 * This macro copies a single simple value from kernel space to user 145 * This macro copies a single simple value from kernel space to user
144 * space. It supports simple types like char and int, but not larger 146 * space. It supports simple types like char and int, but not larger
@@ -157,7 +159,8 @@ extern u64 __ua_limit;
157 * @x: Variable to store result. 159 * @x: Variable to store result.
158 * @ptr: Source address, in user space. 160 * @ptr: Source address, in user space.
159 * 161 *
160 * Context: User context only. This function may sleep. 162 * Context: User context only. This function may sleep if pagefaults are
163 * enabled.
161 * 164 *
162 * This macro copies a single simple variable from user space to kernel 165 * This macro copies a single simple variable from user space to kernel
163 * space. It supports simple types like char and int, but not larger 166 * space. It supports simple types like char and int, but not larger
@@ -177,7 +180,8 @@ extern u64 __ua_limit;
177 * @x: Value to copy to user space. 180 * @x: Value to copy to user space.
178 * @ptr: Destination address, in user space. 181 * @ptr: Destination address, in user space.
179 * 182 *
180 * Context: User context only. This function may sleep. 183 * Context: User context only. This function may sleep if pagefaults are
184 * enabled.
181 * 185 *
182 * This macro copies a single simple value from kernel space to user 186 * This macro copies a single simple value from kernel space to user
183 * space. It supports simple types like char and int, but not larger 187 * space. It supports simple types like char and int, but not larger
@@ -199,7 +203,8 @@ extern u64 __ua_limit;
199 * @x: Variable to store result. 203 * @x: Variable to store result.
200 * @ptr: Source address, in user space. 204 * @ptr: Source address, in user space.
201 * 205 *
202 * Context: User context only. This function may sleep. 206 * Context: User context only. This function may sleep if pagefaults are
207 * enabled.
203 * 208 *
204 * This macro copies a single simple variable from user space to kernel 209 * This macro copies a single simple variable from user space to kernel
205 * space. It supports simple types like char and int, but not larger 210 * space. It supports simple types like char and int, but not larger
@@ -498,7 +503,8 @@ extern void __put_user_unknown(void);
498 * @x: Value to copy to user space. 503 * @x: Value to copy to user space.
499 * @ptr: Destination address, in user space. 504 * @ptr: Destination address, in user space.
500 * 505 *
501 * Context: User context only. This function may sleep. 506 * Context: User context only. This function may sleep if pagefaults are
507 * enabled.
502 * 508 *
503 * This macro copies a single simple value from kernel space to user 509 * This macro copies a single simple value from kernel space to user
504 * space. It supports simple types like char and int, but not larger 510 * space. It supports simple types like char and int, but not larger
@@ -517,7 +523,8 @@ extern void __put_user_unknown(void);
517 * @x: Variable to store result. 523 * @x: Variable to store result.
518 * @ptr: Source address, in user space. 524 * @ptr: Source address, in user space.
519 * 525 *
520 * Context: User context only. This function may sleep. 526 * Context: User context only. This function may sleep if pagefaults are
527 * enabled.
521 * 528 *
522 * This macro copies a single simple variable from user space to kernel 529 * This macro copies a single simple variable from user space to kernel
523 * space. It supports simple types like char and int, but not larger 530 * space. It supports simple types like char and int, but not larger
@@ -537,7 +544,8 @@ extern void __put_user_unknown(void);
537 * @x: Value to copy to user space. 544 * @x: Value to copy to user space.
538 * @ptr: Destination address, in user space. 545 * @ptr: Destination address, in user space.
539 * 546 *
540 * Context: User context only. This function may sleep. 547 * Context: User context only. This function may sleep if pagefaults are
548 * enabled.
541 * 549 *
542 * This macro copies a single simple value from kernel space to user 550 * This macro copies a single simple value from kernel space to user
543 * space. It supports simple types like char and int, but not larger 551 * space. It supports simple types like char and int, but not larger
@@ -559,7 +567,8 @@ extern void __put_user_unknown(void);
559 * @x: Variable to store result. 567 * @x: Variable to store result.
560 * @ptr: Source address, in user space. 568 * @ptr: Source address, in user space.
561 * 569 *
562 * Context: User context only. This function may sleep. 570 * Context: User context only. This function may sleep if pagefaults are
571 * enabled.
563 * 572 *
564 * This macro copies a single simple variable from user space to kernel 573 * This macro copies a single simple variable from user space to kernel
565 * space. It supports simple types like char and int, but not larger 574 * space. It supports simple types like char and int, but not larger
@@ -815,7 +824,8 @@ extern size_t __copy_user(void *__to, const void *__from, size_t __n);
815 * @from: Source address, in kernel space. 824 * @from: Source address, in kernel space.
816 * @n: Number of bytes to copy. 825 * @n: Number of bytes to copy.
817 * 826 *
818 * Context: User context only. This function may sleep. 827 * Context: User context only. This function may sleep if pagefaults are
828 * enabled.
819 * 829 *
820 * Copy data from kernel space to user space. Caller must check 830 * Copy data from kernel space to user space. Caller must check
821 * the specified block with access_ok() before calling this function. 831 * the specified block with access_ok() before calling this function.
@@ -888,7 +898,8 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
888 * @from: Source address, in kernel space. 898 * @from: Source address, in kernel space.
889 * @n: Number of bytes to copy. 899 * @n: Number of bytes to copy.
890 * 900 *
891 * Context: User context only. This function may sleep. 901 * Context: User context only. This function may sleep if pagefaults are
902 * enabled.
892 * 903 *
893 * Copy data from kernel space to user space. 904 * Copy data from kernel space to user space.
894 * 905 *
@@ -1075,7 +1086,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
1075 * @from: Source address, in user space. 1086 * @from: Source address, in user space.
1076 * @n: Number of bytes to copy. 1087 * @n: Number of bytes to copy.
1077 * 1088 *
1078 * Context: User context only. This function may sleep. 1089 * Context: User context only. This function may sleep if pagefaults are
1090 * enabled.
1079 * 1091 *
1080 * Copy data from user space to kernel space. Caller must check 1092 * Copy data from user space to kernel space. Caller must check
1081 * the specified block with access_ok() before calling this function. 1093 * the specified block with access_ok() before calling this function.
@@ -1107,7 +1119,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
1107 * @from: Source address, in user space. 1119 * @from: Source address, in user space.
1108 * @n: Number of bytes to copy. 1120 * @n: Number of bytes to copy.
1109 * 1121 *
1110 * Context: User context only. This function may sleep. 1122 * Context: User context only. This function may sleep if pagefaults are
1123 * enabled.
1111 * 1124 *
1112 * Copy data from user space to kernel space. 1125 * Copy data from user space to kernel space.
1113 * 1126 *
@@ -1329,7 +1342,8 @@ strncpy_from_user(char *__to, const char __user *__from, long __len)
1329 * strlen_user: - Get the size of a string in user space. 1342 * strlen_user: - Get the size of a string in user space.
1330 * @str: The string to measure. 1343 * @str: The string to measure.
1331 * 1344 *
1332 * Context: User context only. This function may sleep. 1345 * Context: User context only. This function may sleep if pagefaults are
1346 * enabled.
1333 * 1347 *
1334 * Get the size of a NUL-terminated string in user space. 1348 * Get the size of a NUL-terminated string in user space.
1335 * 1349 *
@@ -1398,7 +1412,8 @@ static inline long __strnlen_user(const char __user *s, long n)
1398 * strnlen_user: - Get the size of a string in user space. 1412 * strnlen_user: - Get the size of a string in user space.
1399 * @str: The string to measure. 1413 * @str: The string to measure.
1400 * 1414 *
1401 * Context: User context only. This function may sleep. 1415 * Context: User context only. This function may sleep if pagefaults are
1416 * enabled.
1402 * 1417 *
1403 * Get the size of a NUL-terminated string in user space. 1418 * Get the size of a NUL-terminated string in user space.
1404 * 1419 *
diff --git a/arch/mips/kernel/signal-common.h b/arch/mips/kernel/signal-common.h
index 06805e09bcd3..0b85f827cd18 100644
--- a/arch/mips/kernel/signal-common.h
+++ b/arch/mips/kernel/signal-common.h
@@ -28,12 +28,7 @@ extern void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
28extern int fpcsr_pending(unsigned int __user *fpcsr); 28extern int fpcsr_pending(unsigned int __user *fpcsr);
29 29
30/* Make sure we will not lose FPU ownership */ 30/* Make sure we will not lose FPU ownership */
31#ifdef CONFIG_PREEMPT 31#define lock_fpu_owner() ({ preempt_disable(); pagefault_disable(); })
32#define lock_fpu_owner() preempt_disable() 32#define unlock_fpu_owner() ({ pagefault_enable(); preempt_enable(); })
33#define unlock_fpu_owner() preempt_enable()
34#else
35#define lock_fpu_owner() pagefault_disable()
36#define unlock_fpu_owner() pagefault_enable()
37#endif
38 33
39#endif /* __SIGNAL_COMMON_H */ 34#endif /* __SIGNAL_COMMON_H */
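With pagefault_disable() decoupled from preemption, the CONFIG_PREEMPT special case above is no longer enough on its own, so lock_fpu_owner()/unlock_fpu_owner() now always take both steps, nested in the usual order. Sketch of the resulting discipline at a call site:

	lock_fpu_owner();	/* preempt_disable(); pagefault_disable(); */
	/* ... inspect or update FPU state that must stay on this CPU ... */
	unlock_fpu_owner();	/* pagefault_enable(); preempt_enable();   */
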
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 7ff8637e530d..36c0f26fac6b 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -21,10 +21,10 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/kprobes.h> 22#include <linux/kprobes.h>
23#include <linux/perf_event.h> 23#include <linux/perf_event.h>
24#include <linux/uaccess.h>
24 25
25#include <asm/branch.h> 26#include <asm/branch.h>
26#include <asm/mmu_context.h> 27#include <asm/mmu_context.h>
27#include <asm/uaccess.h>
28#include <asm/ptrace.h> 28#include <asm/ptrace.h>
29#include <asm/highmem.h> /* For VMALLOC_END */ 29#include <asm/highmem.h> /* For VMALLOC_END */
30#include <linux/kdebug.h> 30#include <linux/kdebug.h>
@@ -94,7 +94,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
94 * If we're in an interrupt or have no user 94 * If we're in an interrupt or have no user
95 * context, we must not take the fault.. 95 * context, we must not take the fault..
96 */ 96 */
97 if (in_atomic() || !mm) 97 if (faulthandler_disabled() || !mm)
98 goto bad_area_nosemaphore; 98 goto bad_area_nosemaphore;
99 99
100 if (user_mode(regs)) 100 if (user_mode(regs))
diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c
index da815d295239..11661cbc11a8 100644
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -47,7 +47,7 @@ void *kmap_atomic(struct page *page)
47 unsigned long vaddr; 47 unsigned long vaddr;
48 int idx, type; 48 int idx, type;
49 49
50 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 50 preempt_disable();
51 pagefault_disable(); 51 pagefault_disable();
52 if (!PageHighMem(page)) 52 if (!PageHighMem(page))
53 return page_address(page); 53 return page_address(page);
@@ -72,6 +72,7 @@ void __kunmap_atomic(void *kvaddr)
72 72
73 if (vaddr < FIXADDR_START) { // FIXME 73 if (vaddr < FIXADDR_START) { // FIXME
74 pagefault_enable(); 74 pagefault_enable();
75 preempt_enable();
75 return; 76 return;
76 } 77 }
77 78
@@ -92,6 +93,7 @@ void __kunmap_atomic(void *kvaddr)
92#endif 93#endif
93 kmap_atomic_idx_pop(); 94 kmap_atomic_idx_pop();
94 pagefault_enable(); 95 pagefault_enable();
96 preempt_enable();
95} 97}
96EXPORT_SYMBOL(__kunmap_atomic); 98EXPORT_SYMBOL(__kunmap_atomic);
97 99
@@ -104,6 +106,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
104 unsigned long vaddr; 106 unsigned long vaddr;
105 int idx, type; 107 int idx, type;
106 108
109 preempt_disable();
107 pagefault_disable(); 110 pagefault_disable();
108 111
109 type = kmap_atomic_idx_push(); 112 type = kmap_atomic_idx_push();
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index faa5c9822ecc..198a3147dd7d 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -90,6 +90,7 @@ static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot)
90 90
91 BUG_ON(Page_dcache_dirty(page)); 91 BUG_ON(Page_dcache_dirty(page));
92 92
93 preempt_disable();
93 pagefault_disable(); 94 pagefault_disable();
94 idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1); 95 idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
95 idx += in_interrupt() ? FIX_N_COLOURS : 0; 96 idx += in_interrupt() ? FIX_N_COLOURS : 0;
@@ -152,6 +153,7 @@ void kunmap_coherent(void)
152 write_c0_entryhi(old_ctx); 153 write_c0_entryhi(old_ctx);
153 local_irq_restore(flags); 154 local_irq_restore(flags);
154 pagefault_enable(); 155 pagefault_enable();
156 preempt_enable();
155} 157}
156 158
157void copy_user_highpage(struct page *to, struct page *from, 159void copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h
index 2fbbe4d920aa..1ddea5afba09 100644
--- a/arch/mn10300/include/asm/highmem.h
+++ b/arch/mn10300/include/asm/highmem.h
@@ -75,6 +75,7 @@ static inline void *kmap_atomic(struct page *page)
75 unsigned long vaddr; 75 unsigned long vaddr;
76 int idx, type; 76 int idx, type;
77 77
78 preempt_disable();
78 pagefault_disable(); 79 pagefault_disable();
79 if (page < highmem_start_page) 80 if (page < highmem_start_page)
80 return page_address(page); 81 return page_address(page);
@@ -98,6 +99,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
98 99
99 if (vaddr < FIXADDR_START) { /* FIXME */ 100 if (vaddr < FIXADDR_START) { /* FIXME */
100 pagefault_enable(); 101 pagefault_enable();
102 preempt_enable();
101 return; 103 return;
102 } 104 }
103 105
@@ -122,6 +124,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
122 124
123 kmap_atomic_idx_pop(); 125 kmap_atomic_idx_pop();
124 pagefault_enable(); 126 pagefault_enable();
127 preempt_enable();
125} 128}
126#endif /* __KERNEL__ */ 129#endif /* __KERNEL__ */
127 130
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index 0c2cc5d39c8e..4a1d181ed32f 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -23,8 +23,8 @@
23#include <linux/interrupt.h> 23#include <linux/interrupt.h>
24#include <linux/init.h> 24#include <linux/init.h>
25#include <linux/vt_kern.h> /* For unblank_screen() */ 25#include <linux/vt_kern.h> /* For unblank_screen() */
26#include <linux/uaccess.h>
26 27
27#include <asm/uaccess.h>
28#include <asm/pgalloc.h> 28#include <asm/pgalloc.h>
29#include <asm/hardirq.h> 29#include <asm/hardirq.h>
30#include <asm/cpu-regs.h> 30#include <asm/cpu-regs.h>
@@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
168 * If we're in an interrupt or have no user 168 * If we're in an interrupt or have no user
169 * context, we must not take the fault.. 169 * context, we must not take the fault..
170 */ 170 */
171 if (in_atomic() || !mm) 171 if (faulthandler_disabled() || !mm)
172 goto no_context; 172 goto no_context;
173 173
174 if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) 174 if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index 0c9b6afe69e9..b51878b0c6b8 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -77,7 +77,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
77 * If we're in an interrupt or have no user 77 * If we're in an interrupt or have no user
78 * context, we must not take the fault.. 78 * context, we must not take the fault..
79 */ 79 */
80 if (in_atomic() || !mm) 80 if (faulthandler_disabled() || !mm)
81 goto bad_area_nosemaphore; 81 goto bad_area_nosemaphore;
82 82
83 if (user_mode(regs)) 83 if (user_mode(regs))
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index de65f66ea64e..ec2df4bab302 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -142,6 +142,7 @@ static inline void kunmap(struct page *page)
142 142
143static inline void *kmap_atomic(struct page *page) 143static inline void *kmap_atomic(struct page *page)
144{ 144{
145 preempt_disable();
145 pagefault_disable(); 146 pagefault_disable();
146 return page_address(page); 147 return page_address(page);
147} 148}
@@ -150,6 +151,7 @@ static inline void __kunmap_atomic(void *addr)
150{ 151{
151 flush_kernel_dcache_page_addr(addr); 152 flush_kernel_dcache_page_addr(addr);
152 pagefault_enable(); 153 pagefault_enable();
154 preempt_enable();
153} 155}
154 156
155#define kmap_atomic_prot(page, prot) kmap_atomic(page) 157#define kmap_atomic_prot(page, prot) kmap_atomic(page)
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 47ee620d15d2..6548fd1d2e62 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -26,9 +26,9 @@
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/bug.h> 27#include <linux/bug.h>
28#include <linux/ratelimit.h> 28#include <linux/ratelimit.h>
29#include <linux/uaccess.h>
29 30
30#include <asm/assembly.h> 31#include <asm/assembly.h>
31#include <asm/uaccess.h>
32#include <asm/io.h> 32#include <asm/io.h>
33#include <asm/irq.h> 33#include <asm/irq.h>
34#include <asm/traps.h> 34#include <asm/traps.h>
@@ -800,7 +800,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
800 * unless pagefault_disable() was called before. 800 * unless pagefault_disable() was called before.
801 */ 801 */
802 802
803 if (fault_space == 0 && !in_atomic()) 803 if (fault_space == 0 && !faulthandler_disabled())
804 { 804 {
805 pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC); 805 pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC);
806 parisc_terminate("Kernel Fault", regs, code, fault_address); 806 parisc_terminate("Kernel Fault", regs, code, fault_address);
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index e5120e653240..15503adddf4f 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -15,8 +15,8 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/uaccess.h>
18 19
19#include <asm/uaccess.h>
20#include <asm/traps.h> 20#include <asm/traps.h>
21 21
22/* Various important other fields */ 22/* Various important other fields */
@@ -207,7 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
207 int fault; 207 int fault;
208 unsigned int flags; 208 unsigned int flags;
209 209
210 if (in_atomic()) 210 if (pagefault_disabled())
211 goto no_context; 211 goto no_context;
212 212
213 tsk = current; 213 tsk = current;
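Note that parisc, like score and sparc32 elsewhere in this diff, checks the narrower pagefault_disabled() rather than faulthandler_disabled(); which of the two a handler uses is an arch-specific choice made per commit in this series. Roughly:

	/* pagefault_disabled():    true only inside an explicit              */
	/*                          pagefault_disable()..pagefault_enable()   */
	/* faulthandler_disabled(): the above, or any other atomic context    */
	if (pagefault_disabled())
		goto no_context;	/* must not attempt to handle the fault */
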
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 5f1048eaa5b6..8b3b46b7b0f2 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -87,7 +87,7 @@ static inline int prrn_is_enabled(void)
87#include <asm/smp.h> 87#include <asm/smp.h>
88 88
89#define topology_physical_package_id(cpu) (cpu_to_chip_id(cpu)) 89#define topology_physical_package_id(cpu) (cpu_to_chip_id(cpu))
90#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) 90#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
91#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) 91#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
92#define topology_core_id(cpu) (cpu_to_core_id(cpu)) 92#define topology_core_id(cpu) (cpu_to_core_id(cpu))
93#endif 93#endif
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index 3cf529ceec5b..ac93a3bd2730 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -27,11 +27,11 @@ int enter_vmx_usercopy(void)
27 if (in_interrupt()) 27 if (in_interrupt())
28 return 0; 28 return 0;
29 29
30 /* This acts as preempt_disable() as well and will make 30 preempt_disable();
31 * enable_kernel_altivec(). We need to disable page faults 31 /*
32 * as they can call schedule and thus make us lose the VMX 32 * We need to disable page faults as they can call schedule and
33 * context. So on page faults, we just fail which will cause 33 * thus make us lose the VMX context. So on page faults, we just
34 * a fallback to the normal non-vmx copy. 34 * fail which will cause a fallback to the normal non-vmx copy.
35 */ 35 */
36 pagefault_disable(); 36 pagefault_disable();
37 37
@@ -47,6 +47,7 @@ int enter_vmx_usercopy(void)
47int exit_vmx_usercopy(void) 47int exit_vmx_usercopy(void)
48{ 48{
49 pagefault_enable(); 49 pagefault_enable();
50 preempt_enable();
50 return 0; 51 return 0;
51} 52}
52 53
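The powerpc VMX user-copy helpers previously leaned on pagefault_disable() doubling as preempt_disable(); after the decoupling that has to be spelled out, since enable_kernel_altivec() needs preemption off and a reschedule would lose the VMX context. Condensed sketch of the resulting pair, mirroring the hunks above with error handling elided:

	int enter_vmx_usercopy(void)
	{
		if (in_interrupt())
			return 0;
		preempt_disable();	/* required before enable_kernel_altivec()    */
		pagefault_disable();	/* a fault could schedule and drop VMX state  */
		enable_kernel_altivec();
		return 1;
	}

	int exit_vmx_usercopy(void)
	{
		pagefault_enable();
		preempt_enable();
		return 0;
	}
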
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b396868d2aa7..6d535973b200 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -33,13 +33,13 @@
33#include <linux/ratelimit.h> 33#include <linux/ratelimit.h>
34#include <linux/context_tracking.h> 34#include <linux/context_tracking.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/uaccess.h>
36 37
37#include <asm/firmware.h> 38#include <asm/firmware.h>
38#include <asm/page.h> 39#include <asm/page.h>
39#include <asm/pgtable.h> 40#include <asm/pgtable.h>
40#include <asm/mmu.h> 41#include <asm/mmu.h>
41#include <asm/mmu_context.h> 42#include <asm/mmu_context.h>
42#include <asm/uaccess.h>
43#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
44#include <asm/siginfo.h> 44#include <asm/siginfo.h>
45#include <asm/debug.h> 45#include <asm/debug.h>
@@ -272,15 +272,16 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
272 if (!arch_irq_disabled_regs(regs)) 272 if (!arch_irq_disabled_regs(regs))
273 local_irq_enable(); 273 local_irq_enable();
274 274
275 if (in_atomic() || mm == NULL) { 275 if (faulthandler_disabled() || mm == NULL) {
276 if (!user_mode(regs)) { 276 if (!user_mode(regs)) {
277 rc = SIGSEGV; 277 rc = SIGSEGV;
278 goto bail; 278 goto bail;
279 } 279 }
280 /* in_atomic() in user mode is really bad, 280 /* faulthandler_disabled() in user mode is really bad,
281 as is current->mm == NULL. */ 281 as is current->mm == NULL. */
282 printk(KERN_EMERG "Page fault in user mode with " 282 printk(KERN_EMERG "Page fault in user mode with "
283 "in_atomic() = %d mm = %p\n", in_atomic(), mm); 283 "faulthandler_disabled() = %d mm = %p\n",
284 faulthandler_disabled(), mm);
284 printk(KERN_EMERG "NIP = %lx MSR = %lx\n", 285 printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
285 regs->nip, regs->msr); 286 regs->nip, regs->msr);
286 die("Weird page fault", regs, SIGSEGV); 287 die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index e7450bdbe83a..e292c8a60952 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -34,7 +34,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
34 unsigned long vaddr; 34 unsigned long vaddr;
35 int idx, type; 35 int idx, type;
36 36
37 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 37 preempt_disable();
38 pagefault_disable(); 38 pagefault_disable();
39 if (!PageHighMem(page)) 39 if (!PageHighMem(page))
40 return page_address(page); 40 return page_address(page);
@@ -59,6 +59,7 @@ void __kunmap_atomic(void *kvaddr)
59 59
60 if (vaddr < __fix_to_virt(FIX_KMAP_END)) { 60 if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
61 pagefault_enable(); 61 pagefault_enable();
62 preempt_enable();
62 return; 63 return;
63 } 64 }
64 65
@@ -82,5 +83,6 @@ void __kunmap_atomic(void *kvaddr)
82 83
83 kmap_atomic_idx_pop(); 84 kmap_atomic_idx_pop();
84 pagefault_enable(); 85 pagefault_enable();
86 preempt_enable();
85} 87}
86EXPORT_SYMBOL(__kunmap_atomic); 88EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index cbd3d069897f..723a099f6be3 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -217,7 +217,7 @@ static DEFINE_RAW_SPINLOCK(tlbivax_lock);
217static int mm_is_core_local(struct mm_struct *mm) 217static int mm_is_core_local(struct mm_struct *mm)
218{ 218{
219 return cpumask_subset(mm_cpumask(mm), 219 return cpumask_subset(mm_cpumask(mm),
220 topology_thread_cpumask(smp_processor_id())); 220 topology_sibling_cpumask(smp_processor_id()));
221} 221}
222 222
223struct tlb_flush_param { 223struct tlb_flush_param {
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index b1453a2ae1ca..4990f6c66288 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -22,7 +22,8 @@ DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology);
22 22
23#define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id) 23#define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id)
24#define topology_thread_id(cpu) (per_cpu(cpu_topology, cpu).thread_id) 24#define topology_thread_id(cpu) (per_cpu(cpu_topology, cpu).thread_id)
25#define topology_thread_cpumask(cpu) (&per_cpu(cpu_topology, cpu).thread_mask) 25#define topology_sibling_cpumask(cpu) \
26 (&per_cpu(cpu_topology, cpu).thread_mask)
26#define topology_core_id(cpu) (per_cpu(cpu_topology, cpu).core_id) 27#define topology_core_id(cpu) (per_cpu(cpu_topology, cpu).core_id)
27#define topology_core_cpumask(cpu) (&per_cpu(cpu_topology, cpu).core_mask) 28#define topology_core_cpumask(cpu) (&per_cpu(cpu_topology, cpu).core_mask)
28#define topology_book_id(cpu) (per_cpu(cpu_topology, cpu).book_id) 29#define topology_book_id(cpu) (per_cpu(cpu_topology, cpu).book_id)
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index d64a7a62164f..9dd4cc47ddc7 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -98,7 +98,8 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x)
98 * @from: Source address, in user space. 98 * @from: Source address, in user space.
99 * @n: Number of bytes to copy. 99 * @n: Number of bytes to copy.
100 * 100 *
101 * Context: User context only. This function may sleep. 101 * Context: User context only. This function may sleep if pagefaults are
102 * enabled.
102 * 103 *
103 * Copy data from user space to kernel space. Caller must check 104 * Copy data from user space to kernel space. Caller must check
104 * the specified block with access_ok() before calling this function. 105 * the specified block with access_ok() before calling this function.
@@ -118,7 +119,8 @@ unsigned long __must_check __copy_from_user(void *to, const void __user *from,
118 * @from: Source address, in kernel space. 119 * @from: Source address, in kernel space.
119 * @n: Number of bytes to copy. 120 * @n: Number of bytes to copy.
120 * 121 *
121 * Context: User context only. This function may sleep. 122 * Context: User context only. This function may sleep if pagefaults are
123 * enabled.
122 * 124 *
123 * Copy data from kernel space to user space. Caller must check 125 * Copy data from kernel space to user space. Caller must check
124 * the specified block with access_ok() before calling this function. 126 * the specified block with access_ok() before calling this function.
@@ -264,7 +266,8 @@ int __get_user_bad(void) __attribute__((noreturn));
264 * @from: Source address, in kernel space. 266 * @from: Source address, in kernel space.
265 * @n: Number of bytes to copy. 267 * @n: Number of bytes to copy.
266 * 268 *
267 * Context: User context only. This function may sleep. 269 * Context: User context only. This function may sleep if pagefaults are
270 * enabled.
268 * 271 *
269 * Copy data from kernel space to user space. 272 * Copy data from kernel space to user space.
270 * 273 *
@@ -290,7 +293,8 @@ __compiletime_warning("copy_from_user() buffer size is not provably correct")
290 * @from: Source address, in user space. 293 * @from: Source address, in user space.
291 * @n: Number of bytes to copy. 294 * @n: Number of bytes to copy.
292 * 295 *
293 * Context: User context only. This function may sleep. 296 * Context: User context only. This function may sleep if pagefaults are
297 * enabled.
294 * 298 *
295 * Copy data from user space to kernel space. 299 * Copy data from user space to kernel space.
296 * 300 *
@@ -348,7 +352,8 @@ static inline unsigned long strnlen_user(const char __user *src, unsigned long n
348 * strlen_user: - Get the size of a string in user space. 352 * strlen_user: - Get the size of a string in user space.
349 * @str: The string to measure. 353 * @str: The string to measure.
350 * 354 *
351 * Context: User context only. This function may sleep. 355 * Context: User context only. This function may sleep if pagefaults are
356 * enabled.
352 * 357 *
353 * Get the size of a NUL-terminated string in user space. 358 * Get the size of a NUL-terminated string in user space.
354 * 359 *
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 76515bcea2f1..4c8f5d7f9c23 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -399,7 +399,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
399 * user context. 399 * user context.
400 */ 400 */
401 fault = VM_FAULT_BADCONTEXT; 401 fault = VM_FAULT_BADCONTEXT;
402 if (unlikely(!user_space_fault(regs) || in_atomic() || !mm)) 402 if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm))
403 goto out; 403 goto out;
404 404
405 address = trans_exc_code & __FAIL_ADDR_MASK; 405 address = trans_exc_code & __FAIL_ADDR_MASK;
diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h
index ab66ddde777b..20a3591225cc 100644
--- a/arch/score/include/asm/uaccess.h
+++ b/arch/score/include/asm/uaccess.h
@@ -36,7 +36,8 @@
36 * @addr: User space pointer to start of block to check 36 * @addr: User space pointer to start of block to check
37 * @size: Size of block to check 37 * @size: Size of block to check
38 * 38 *
39 * Context: User context only. This function may sleep. 39 * Context: User context only. This function may sleep if pagefaults are
40 * enabled.
40 * 41 *
41 * Checks if a pointer to a block of memory in user space is valid. 42 * Checks if a pointer to a block of memory in user space is valid.
42 * 43 *
@@ -61,7 +62,8 @@
61 * @x: Value to copy to user space. 62 * @x: Value to copy to user space.
62 * @ptr: Destination address, in user space. 63 * @ptr: Destination address, in user space.
63 * 64 *
64 * Context: User context only. This function may sleep. 65 * Context: User context only. This function may sleep if pagefaults are
66 * enabled.
65 * 67 *
66 * This macro copies a single simple value from kernel space to user 68 * This macro copies a single simple value from kernel space to user
67 * space. It supports simple types like char and int, but not larger 69 * space. It supports simple types like char and int, but not larger
@@ -79,7 +81,8 @@
79 * @x: Variable to store result. 81 * @x: Variable to store result.
80 * @ptr: Source address, in user space. 82 * @ptr: Source address, in user space.
81 * 83 *
82 * Context: User context only. This function may sleep. 84 * Context: User context only. This function may sleep if pagefaults are
85 * enabled.
83 * 86 *
84 * This macro copies a single simple variable from user space to kernel 87 * This macro copies a single simple variable from user space to kernel
85 * space. It supports simple types like char and int, but not larger 88 * space. It supports simple types like char and int, but not larger
@@ -98,7 +101,8 @@
98 * @x: Value to copy to user space. 101 * @x: Value to copy to user space.
99 * @ptr: Destination address, in user space. 102 * @ptr: Destination address, in user space.
100 * 103 *
101 * Context: User context only. This function may sleep. 104 * Context: User context only. This function may sleep if pagefaults are
105 * enabled.
102 * 106 *
103 * This macro copies a single simple value from kernel space to user 107 * This macro copies a single simple value from kernel space to user
104 * space. It supports simple types like char and int, but not larger 108 * space. It supports simple types like char and int, but not larger
@@ -119,7 +123,8 @@
119 * @x: Variable to store result. 123 * @x: Variable to store result.
120 * @ptr: Source address, in user space. 124 * @ptr: Source address, in user space.
121 * 125 *
122 * Context: User context only. This function may sleep. 126 * Context: User context only. This function may sleep if pagefaults are
127 * enabled.
123 * 128 *
124 * This macro copies a single simple variable from user space to kernel 129 * This macro copies a single simple variable from user space to kernel
125 * space. It supports simple types like char and int, but not larger 130 * space. It supports simple types like char and int, but not larger
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
index 6860beb2a280..37a6c2e0e969 100644
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -34,6 +34,7 @@
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/types.h> 35#include <linux/types.h>
36#include <linux/ptrace.h> 36#include <linux/ptrace.h>
37#include <linux/uaccess.h>
37 38
38/* 39/*
39 * This routine handles page faults. It determines the address, 40 * This routine handles page faults. It determines the address,
@@ -73,7 +74,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
73 * If we're in an interrupt or have no user 74 * If we're in an interrupt or have no user
74 * context, we must not take the fault.. 75 * context, we must not take the fault..
75 */ 76 */
76 if (in_atomic() || !mm) 77 if (pagefault_disabled() || !mm)
77 goto bad_area_nosemaphore; 78 goto bad_area_nosemaphore;
78 79
79 if (user_mode(regs)) 80 if (user_mode(regs))
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index a58fec9b55e0..79d8276377d1 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -17,6 +17,7 @@
17#include <linux/kprobes.h> 17#include <linux/kprobes.h>
18#include <linux/perf_event.h> 18#include <linux/perf_event.h>
19#include <linux/kdebug.h> 19#include <linux/kdebug.h>
20#include <linux/uaccess.h>
20#include <asm/io_trapped.h> 21#include <asm/io_trapped.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/tlbflush.h> 23#include <asm/tlbflush.h>
@@ -438,9 +439,9 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
438 439
439 /* 440 /*
440 * If we're in an interrupt, have no user context or are running 441 * If we're in an interrupt, have no user context or are running
441 * in an atomic region then we must not take the fault: 442 * with pagefaults disabled then we must not take the fault:
442 */ 443 */
443 if (unlikely(in_atomic() || !mm)) { 444 if (unlikely(faulthandler_disabled() || !mm)) {
444 bad_area_nosemaphore(regs, error_code, address); 445 bad_area_nosemaphore(regs, error_code, address);
445 return; 446 return;
446 } 447 }
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index d1761df5cca6..01d17046225a 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -41,7 +41,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
41#define topology_physical_package_id(cpu) (cpu_data(cpu).proc_id) 41#define topology_physical_package_id(cpu) (cpu_data(cpu).proc_id)
42#define topology_core_id(cpu) (cpu_data(cpu).core_id) 42#define topology_core_id(cpu) (cpu_data(cpu).core_id)
43#define topology_core_cpumask(cpu) (&cpu_core_sib_map[cpu]) 43#define topology_core_cpumask(cpu) (&cpu_core_sib_map[cpu])
44#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) 44#define topology_sibling_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
45#endif /* CONFIG_SMP */ 45#endif /* CONFIG_SMP */
46 46
47extern cpumask_t cpu_core_map[NR_CPUS]; 47extern cpumask_t cpu_core_map[NR_CPUS];
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 70d817154fe8..c399e7b3b035 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -21,6 +21,7 @@
21#include <linux/perf_event.h> 21#include <linux/perf_event.h>
22#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/kdebug.h> 23#include <linux/kdebug.h>
24#include <linux/uaccess.h>
24 25
25#include <asm/page.h> 26#include <asm/page.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -29,7 +30,6 @@
29#include <asm/setup.h> 30#include <asm/setup.h>
30#include <asm/smp.h> 31#include <asm/smp.h>
31#include <asm/traps.h> 32#include <asm/traps.h>
32#include <asm/uaccess.h>
33 33
34#include "mm_32.h" 34#include "mm_32.h"
35 35
@@ -196,7 +196,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
196 * If we're in an interrupt or have no user 196 * If we're in an interrupt or have no user
197 * context, we must not take the fault.. 197 * context, we must not take the fault..
198 */ 198 */
199 if (in_atomic() || !mm) 199 if (pagefault_disabled() || !mm)
200 goto no_context; 200 goto no_context;
201 201
202 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); 202 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 479823249429..e9268ea1a68d 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -22,12 +22,12 @@
22#include <linux/kdebug.h> 22#include <linux/kdebug.h>
23#include <linux/percpu.h> 23#include <linux/percpu.h>
24#include <linux/context_tracking.h> 24#include <linux/context_tracking.h>
25#include <linux/uaccess.h>
25 26
26#include <asm/page.h> 27#include <asm/page.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include <asm/openprom.h> 29#include <asm/openprom.h>
29#include <asm/oplib.h> 30#include <asm/oplib.h>
30#include <asm/uaccess.h>
31#include <asm/asi.h> 31#include <asm/asi.h>
32#include <asm/lsu.h> 32#include <asm/lsu.h>
33#include <asm/sections.h> 33#include <asm/sections.h>
@@ -330,7 +330,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
330 * If we're in an interrupt or have no user 330 * If we're in an interrupt or have no user
331 * context, we must not take the fault.. 331 * context, we must not take the fault..
332 */ 332 */
333 if (in_atomic() || !mm) 333 if (faulthandler_disabled() || !mm)
334 goto intr_or_no_mm; 334 goto intr_or_no_mm;
335 335
336 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); 336 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index 449f864f0cef..a454ec5ff07a 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page)
53 unsigned long vaddr; 53 unsigned long vaddr;
54 long idx, type; 54 long idx, type;
55 55
56 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 56 preempt_disable();
57 pagefault_disable(); 57 pagefault_disable();
58 if (!PageHighMem(page)) 58 if (!PageHighMem(page))
59 return page_address(page); 59 return page_address(page);
@@ -91,6 +91,7 @@ void __kunmap_atomic(void *kvaddr)
91 91
92 if (vaddr < FIXADDR_START) { // FIXME 92 if (vaddr < FIXADDR_START) { // FIXME
93 pagefault_enable(); 93 pagefault_enable();
94 preempt_enable();
94 return; 95 return;
95 } 96 }
96 97
@@ -126,5 +127,6 @@ void __kunmap_atomic(void *kvaddr)
126 127
127 kmap_atomic_idx_pop(); 128 kmap_atomic_idx_pop();
128 pagefault_enable(); 129 pagefault_enable();
130 preempt_enable();
129} 131}
130EXPORT_SYMBOL(__kunmap_atomic); 132EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 559cb744112c..c5d08b89a96c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2738,7 +2738,7 @@ void hugetlb_setup(struct pt_regs *regs)
2738 struct mm_struct *mm = current->mm; 2738 struct mm_struct *mm = current->mm;
2739 struct tsb_config *tp; 2739 struct tsb_config *tp;
2740 2740
2741 if (in_atomic() || !mm) { 2741 if (faulthandler_disabled() || !mm) {
2742 const struct exception_table_entry *entry; 2742 const struct exception_table_entry *entry;
2743 2743
2744 entry = search_exception_tables(regs->tpc); 2744 entry = search_exception_tables(regs->tpc);
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 938311844233..76b0d0ebb244 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -55,7 +55,7 @@ static inline const struct cpumask *cpumask_of_node(int node)
55#define topology_physical_package_id(cpu) ((void)(cpu), 0) 55#define topology_physical_package_id(cpu) ((void)(cpu), 0)
56#define topology_core_id(cpu) (cpu) 56#define topology_core_id(cpu) (cpu)
57#define topology_core_cpumask(cpu) ((void)(cpu), cpu_online_mask) 57#define topology_core_cpumask(cpu) ((void)(cpu), cpu_online_mask)
58#define topology_thread_cpumask(cpu) cpumask_of(cpu) 58#define topology_sibling_cpumask(cpu) cpumask_of(cpu)
59#endif 59#endif
60 60
61#endif /* _ASM_TILE_TOPOLOGY_H */ 61#endif /* _ASM_TILE_TOPOLOGY_H */
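The topology_thread_cpumask() -> topology_sibling_cpumask() rename seen here recurs through the rest of the series. A minimal sketch of the renamed accessor (the wrapper function is illustrative):

#include <linux/cpumask.h>
#include <linux/topology.h>

/* Number of hardware threads, including @cpu itself, sharing @cpu's core. */
static unsigned int smt_siblings_of(int cpu)
{
        return cpumask_weight(topology_sibling_cpumask(cpu));
}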
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index f41cb53cf645..a33276bf5ca1 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -78,7 +78,8 @@ int __range_ok(unsigned long addr, unsigned long size);
78 * @addr: User space pointer to start of block to check 78 * @addr: User space pointer to start of block to check
79 * @size: Size of block to check 79 * @size: Size of block to check
80 * 80 *
81 * Context: User context only. This function may sleep. 81 * Context: User context only. This function may sleep if pagefaults are
82 * enabled.
82 * 83 *
83 * Checks if a pointer to a block of memory in user space is valid. 84 * Checks if a pointer to a block of memory in user space is valid.
84 * 85 *
@@ -192,7 +193,8 @@ extern int __get_user_bad(void)
192 * @x: Variable to store result. 193 * @x: Variable to store result.
193 * @ptr: Source address, in user space. 194 * @ptr: Source address, in user space.
194 * 195 *
195 * Context: User context only. This function may sleep. 196 * Context: User context only. This function may sleep if pagefaults are
197 * enabled.
196 * 198 *
197 * This macro copies a single simple variable from user space to kernel 199 * This macro copies a single simple variable from user space to kernel
198 * space. It supports simple types like char and int, but not larger 200 * space. It supports simple types like char and int, but not larger
@@ -274,7 +276,8 @@ extern int __put_user_bad(void)
274 * @x: Value to copy to user space. 276 * @x: Value to copy to user space.
275 * @ptr: Destination address, in user space. 277 * @ptr: Destination address, in user space.
276 * 278 *
277 * Context: User context only. This function may sleep. 279 * Context: User context only. This function may sleep if pagefaults are
280 * enabled.
278 * 281 *
279 * This macro copies a single simple value from kernel space to user 282 * This macro copies a single simple value from kernel space to user
280 * space. It supports simple types like char and int, but not larger 283 * space. It supports simple types like char and int, but not larger
@@ -330,7 +333,8 @@ extern int __put_user_bad(void)
330 * @from: Source address, in kernel space. 333 * @from: Source address, in kernel space.
331 * @n: Number of bytes to copy. 334 * @n: Number of bytes to copy.
332 * 335 *
333 * Context: User context only. This function may sleep. 336 * Context: User context only. This function may sleep if pagefaults are
337 * enabled.
334 * 338 *
335 * Copy data from kernel space to user space. Caller must check 339 * Copy data from kernel space to user space. Caller must check
336 * the specified block with access_ok() before calling this function. 340 * the specified block with access_ok() before calling this function.
@@ -366,7 +370,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
366 * @from: Source address, in user space. 370 * @from: Source address, in user space.
367 * @n: Number of bytes to copy. 371 * @n: Number of bytes to copy.
368 * 372 *
369 * Context: User context only. This function may sleep. 373 * Context: User context only. This function may sleep if pagefaults are
374 * enabled.
370 * 375 *
371 * Copy data from user space to kernel space. Caller must check 376 * Copy data from user space to kernel space. Caller must check
372 * the specified block with access_ok() before calling this function. 377 * the specified block with access_ok() before calling this function.
@@ -437,7 +442,8 @@ static inline unsigned long __must_check copy_from_user(void *to,
437 * @from: Source address, in user space. 442 * @from: Source address, in user space.
438 * @n: Number of bytes to copy. 443 * @n: Number of bytes to copy.
439 * 444 *
440 * Context: User context only. This function may sleep. 445 * Context: User context only. This function may sleep if pagefaults are
446 * enabled.
441 * 447 *
442 * Copy data from user space to user space. Caller must check 448 * Copy data from user space to user space. Caller must check
443 * the specified blocks with access_ok() before calling this function. 449 * the specified blocks with access_ok() before calling this function.
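The comment change repeated through these uaccess headers spells out the new contract: access_ok(), get_user(), put_user() and the copy_*_user() helpers may sleep only while pagefaults are enabled. A hedged sketch of a conforming caller (names are illustrative):

#include <linux/uaccess.h>
#include <linux/errno.h>

/*
 * Runs in process context with pagefaults enabled, so get_user() is
 * allowed to sleep while faulting the user page in.
 */
static long read_user_flag(const int __user *uptr)
{
        int flag;

        if (get_user(flag, uptr))
                return -EFAULT;

        return flag;
}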
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index e83cc999da02..3f4f58d34a92 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -354,9 +354,9 @@ static int handle_page_fault(struct pt_regs *regs,
354 354
355 /* 355 /*
356 * If we're in an interrupt, have no user context or are running in an 356 * If we're in an interrupt, have no user context or are running in an
357 * atomic region then we must not take the fault. 357 * region with pagefaults disabled then we must not take the fault.
358 */ 358 */
359 if (in_atomic() || !mm) { 359 if (pagefault_disabled() || !mm) {
360 vma = NULL; /* happy compiler */ 360 vma = NULL; /* happy compiler */
361 goto bad_area_nosemaphore; 361 goto bad_area_nosemaphore;
362 } 362 }
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 6aa2f2625447..fcd545014e79 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -201,7 +201,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
201 int idx, type; 201 int idx, type;
202 pte_t *pte; 202 pte_t *pte;
203 203
204 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 204 preempt_disable();
205 pagefault_disable(); 205 pagefault_disable();
206 206
207 /* Avoid icache flushes by disallowing atomic executable mappings. */ 207 /* Avoid icache flushes by disallowing atomic executable mappings. */
@@ -259,6 +259,7 @@ void __kunmap_atomic(void *kvaddr)
259 } 259 }
260 260
261 pagefault_enable(); 261 pagefault_enable();
262 preempt_enable();
262} 263}
263EXPORT_SYMBOL(__kunmap_atomic); 264EXPORT_SYMBOL(__kunmap_atomic);
264 265
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 8e4daf44e980..47ff9b7f3e5d 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -7,6 +7,7 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/hardirq.h> 8#include <linux/hardirq.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/uaccess.h>
10#include <asm/current.h> 11#include <asm/current.h>
11#include <asm/pgtable.h> 12#include <asm/pgtable.h>
12#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
@@ -35,10 +36,10 @@ int handle_page_fault(unsigned long address, unsigned long ip,
35 *code_out = SEGV_MAPERR; 36 *code_out = SEGV_MAPERR;
36 37
37 /* 38 /*
38 * If the fault was during atomic operation, don't take the fault, just 39 * If the fault was with pagefaults disabled, don't take the fault, just
39 * fail. 40 * fail.
40 */ 41 */
41 if (in_atomic()) 42 if (faulthandler_disabled())
42 goto out_nosemaphore; 43 goto out_nosemaphore;
43 44
44 if (is_user) 45 if (is_user)
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index 0dc922dba915..afccef5529cc 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -218,7 +218,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
218 * If we're in an interrupt or have no user 218 * If we're in an interrupt or have no user
219 * context, we must not take the fault.. 219 * context, we must not take the fault..
220 */ 220 */
221 if (in_atomic() || !mm) 221 if (faulthandler_disabled() || !mm)
222 goto no_context; 222 goto no_context;
223 223
224 if (user_mode(regs)) 224 if (user_mode(regs))
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 8f3271842533..dca71714f860 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -99,11 +99,9 @@ static __always_inline bool should_resched(void)
99 extern asmlinkage void ___preempt_schedule(void); 99 extern asmlinkage void ___preempt_schedule(void);
100# define __preempt_schedule() asm ("call ___preempt_schedule") 100# define __preempt_schedule() asm ("call ___preempt_schedule")
101 extern asmlinkage void preempt_schedule(void); 101 extern asmlinkage void preempt_schedule(void);
102# ifdef CONFIG_CONTEXT_TRACKING 102 extern asmlinkage void ___preempt_schedule_notrace(void);
103 extern asmlinkage void ___preempt_schedule_context(void); 103# define __preempt_schedule_notrace() asm ("call ___preempt_schedule_notrace")
104# define __preempt_schedule_context() asm ("call ___preempt_schedule_context") 104 extern asmlinkage void preempt_schedule_notrace(void);
105 extern asmlinkage void preempt_schedule_context(void);
106# endif
107#endif 105#endif
108 106
109#endif /* __ASM_PREEMPT_H */ 107#endif /* __ASM_PREEMPT_H */
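preempt_schedule_context() is renamed to preempt_schedule_notrace() and is now built for every CONFIG_PREEMPT kernel rather than only when context tracking is enabled. Tracing-sensitive code reaches it through the _notrace preemption helpers, roughly as in this sketch (the probe function is hypothetical):

#include <linux/preempt.h>

/* A probe on the function-tracing path that must not recurse into the tracer. */
static void example_probe(void)
{
        preempt_disable_notrace();
        /* ... per-CPU bookkeeping that must stay on this CPU ... */
        preempt_enable_notrace();       /* may reschedule via preempt_schedule_notrace() */
}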
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 17a8dced12da..222a6a3ca2b5 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -37,16 +37,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
37DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); 37DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
38DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); 38DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
39 39
40static inline struct cpumask *cpu_sibling_mask(int cpu)
41{
42 return per_cpu(cpu_sibling_map, cpu);
43}
44
45static inline struct cpumask *cpu_core_mask(int cpu)
46{
47 return per_cpu(cpu_core_map, cpu);
48}
49
50static inline struct cpumask *cpu_llc_shared_mask(int cpu) 40static inline struct cpumask *cpu_llc_shared_mask(int cpu)
51{ 41{
52 return per_cpu(cpu_llc_shared_map, cpu); 42 return per_cpu(cpu_llc_shared_map, cpu);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 0e8f04f2c26f..5a77593fdace 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -124,7 +124,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
124 124
125#ifdef ENABLE_TOPO_DEFINES 125#ifdef ENABLE_TOPO_DEFINES
126#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) 126#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
127#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) 127#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
128#endif 128#endif
129 129
130static inline void arch_fix_phys_package_id(int num, u32 slot) 130static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index ace9dec050b1..a8df874f3e88 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -74,7 +74,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
74 * @addr: User space pointer to start of block to check 74 * @addr: User space pointer to start of block to check
75 * @size: Size of block to check 75 * @size: Size of block to check
76 * 76 *
77 * Context: User context only. This function may sleep. 77 * Context: User context only. This function may sleep if pagefaults are
78 * enabled.
78 * 79 *
79 * Checks if a pointer to a block of memory in user space is valid. 80 * Checks if a pointer to a block of memory in user space is valid.
80 * 81 *
@@ -145,7 +146,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
145 * @x: Variable to store result. 146 * @x: Variable to store result.
146 * @ptr: Source address, in user space. 147 * @ptr: Source address, in user space.
147 * 148 *
148 * Context: User context only. This function may sleep. 149 * Context: User context only. This function may sleep if pagefaults are
150 * enabled.
149 * 151 *
150 * This macro copies a single simple variable from user space to kernel 152 * This macro copies a single simple variable from user space to kernel
151 * space. It supports simple types like char and int, but not larger 153 * space. It supports simple types like char and int, but not larger
@@ -240,7 +242,8 @@ extern void __put_user_8(void);
240 * @x: Value to copy to user space. 242 * @x: Value to copy to user space.
241 * @ptr: Destination address, in user space. 243 * @ptr: Destination address, in user space.
242 * 244 *
243 * Context: User context only. This function may sleep. 245 * Context: User context only. This function may sleep if pagefaults are
246 * enabled.
244 * 247 *
245 * This macro copies a single simple value from kernel space to user 248 * This macro copies a single simple value from kernel space to user
246 * space. It supports simple types like char and int, but not larger 249 * space. It supports simple types like char and int, but not larger
@@ -455,7 +458,8 @@ struct __large_struct { unsigned long buf[100]; };
455 * @x: Variable to store result. 458 * @x: Variable to store result.
456 * @ptr: Source address, in user space. 459 * @ptr: Source address, in user space.
457 * 460 *
458 * Context: User context only. This function may sleep. 461 * Context: User context only. This function may sleep if pagefaults are
462 * enabled.
459 * 463 *
460 * This macro copies a single simple variable from user space to kernel 464 * This macro copies a single simple variable from user space to kernel
461 * space. It supports simple types like char and int, but not larger 465 * space. It supports simple types like char and int, but not larger
@@ -479,7 +483,8 @@ struct __large_struct { unsigned long buf[100]; };
479 * @x: Value to copy to user space. 483 * @x: Value to copy to user space.
480 * @ptr: Destination address, in user space. 484 * @ptr: Destination address, in user space.
481 * 485 *
482 * Context: User context only. This function may sleep. 486 * Context: User context only. This function may sleep if pagefaults are
487 * enabled.
483 * 488 *
484 * This macro copies a single simple value from kernel space to user 489 * This macro copies a single simple value from kernel space to user
485 * space. It supports simple types like char and int, but not larger 490 * space. It supports simple types like char and int, but not larger
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 3c03a5de64d3..7c8ad3451988 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -70,7 +70,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
70 * @from: Source address, in kernel space. 70 * @from: Source address, in kernel space.
71 * @n: Number of bytes to copy. 71 * @n: Number of bytes to copy.
72 * 72 *
73 * Context: User context only. This function may sleep. 73 * Context: User context only. This function may sleep if pagefaults are
74 * enabled.
74 * 75 *
75 * Copy data from kernel space to user space. Caller must check 76 * Copy data from kernel space to user space. Caller must check
76 * the specified block with access_ok() before calling this function. 77 * the specified block with access_ok() before calling this function.
@@ -117,7 +118,8 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
117 * @from: Source address, in user space. 118 * @from: Source address, in user space.
118 * @n: Number of bytes to copy. 119 * @n: Number of bytes to copy.
119 * 120 *
120 * Context: User context only. This function may sleep. 121 * Context: User context only. This function may sleep if pagefaults are
122 * enabled.
121 * 123 *
122 * Copy data from user space to kernel space. Caller must check 124 * Copy data from user space to kernel space. Caller must check
123 * the specified block with access_ok() before calling this function. 125 * the specified block with access_ok() before calling this function.
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 19980d9a6cc9..b9826a981fb2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2576,7 +2576,7 @@ static void intel_pmu_cpu_starting(int cpu)
2576 if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { 2576 if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
2577 void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; 2577 void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
2578 2578
2579 for_each_cpu(i, topology_thread_cpumask(cpu)) { 2579 for_each_cpu(i, topology_sibling_cpumask(cpu)) {
2580 struct intel_shared_regs *pc; 2580 struct intel_shared_regs *pc;
2581 2581
2582 pc = per_cpu(cpu_hw_events, i).shared_regs; 2582 pc = per_cpu(cpu_hw_events, i).shared_regs;
@@ -2594,7 +2594,7 @@ static void intel_pmu_cpu_starting(int cpu)
2594 cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; 2594 cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
2595 2595
2596 if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { 2596 if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
2597 for_each_cpu(i, topology_thread_cpumask(cpu)) { 2597 for_each_cpu(i, topology_sibling_cpumask(cpu)) {
2598 struct intel_excl_cntrs *c; 2598 struct intel_excl_cntrs *c;
2599 2599
2600 c = per_cpu(cpu_hw_events, i).excl_cntrs; 2600 c = per_cpu(cpu_hw_events, i).excl_cntrs;
@@ -3362,7 +3362,7 @@ static __init int fixup_ht_bug(void)
3362 if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) 3362 if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
3363 return 0; 3363 return 0;
3364 3364
3365 w = cpumask_weight(topology_thread_cpumask(cpu)); 3365 w = cpumask_weight(topology_sibling_cpumask(cpu));
3366 if (w > 1) { 3366 if (w > 1) {
3367 pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); 3367 pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
3368 return 0; 3368 return 0;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index e7d8c7608471..18ca99f2798b 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -12,7 +12,8 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
12{ 12{
13#ifdef CONFIG_SMP 13#ifdef CONFIG_SMP
14 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); 14 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
15 seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu))); 15 seq_printf(m, "siblings\t: %d\n",
16 cpumask_weight(topology_core_cpumask(cpu)));
16 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); 17 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
17 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); 18 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
18 seq_printf(m, "apicid\t\t: %d\n", c->apicid); 19 seq_printf(m, "apicid\t\t: %d\n", c->apicid);
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 05fd74f537d6..64341aa485ae 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -40,7 +40,5 @@ EXPORT_SYMBOL(empty_zero_page);
40 40
41#ifdef CONFIG_PREEMPT 41#ifdef CONFIG_PREEMPT
42EXPORT_SYMBOL(___preempt_schedule); 42EXPORT_SYMBOL(___preempt_schedule);
43#ifdef CONFIG_CONTEXT_TRACKING 43EXPORT_SYMBOL(___preempt_schedule_notrace);
44EXPORT_SYMBOL(___preempt_schedule_context);
45#endif
46#endif 44#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6e338e3b1dc0..c648139d68d7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -445,11 +445,10 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
445} 445}
446 446
447/* 447/*
448 * MONITOR/MWAIT with no hints, used for default default C1 state. 448 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
449 * This invokes MWAIT with interrutps enabled and no flags, 449 * with interrupts enabled and no flags, which is backwards compatible with the
450 * which is backwards compatible with the original MWAIT implementation. 450 * original MWAIT implementation.
451 */ 451 */
452
453static void mwait_idle(void) 452static void mwait_idle(void)
454{ 453{
455 if (!current_set_polling_and_test()) { 454 if (!current_set_polling_and_test()) {
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 50e547eac8cd..0e8209619455 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -314,10 +314,10 @@ topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
314 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); 314 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
315} 315}
316 316
317#define link_mask(_m, c1, c2) \ 317#define link_mask(mfunc, c1, c2) \
318do { \ 318do { \
319 cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \ 319 cpumask_set_cpu((c1), mfunc(c2)); \
320 cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ 320 cpumask_set_cpu((c2), mfunc(c1)); \
321} while (0) 321} while (0)
322 322
323static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) 323static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
@@ -398,9 +398,9 @@ void set_cpu_sibling_map(int cpu)
398 cpumask_set_cpu(cpu, cpu_sibling_setup_mask); 398 cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
399 399
400 if (!has_mp) { 400 if (!has_mp) {
401 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 401 cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
402 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); 402 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
403 cpumask_set_cpu(cpu, cpu_core_mask(cpu)); 403 cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
404 c->booted_cores = 1; 404 c->booted_cores = 1;
405 return; 405 return;
406 } 406 }
@@ -409,32 +409,34 @@ void set_cpu_sibling_map(int cpu)
409 o = &cpu_data(i); 409 o = &cpu_data(i);
410 410
411 if ((i == cpu) || (has_smt && match_smt(c, o))) 411 if ((i == cpu) || (has_smt && match_smt(c, o)))
412 link_mask(sibling, cpu, i); 412 link_mask(topology_sibling_cpumask, cpu, i);
413 413
414 if ((i == cpu) || (has_mp && match_llc(c, o))) 414 if ((i == cpu) || (has_mp && match_llc(c, o)))
415 link_mask(llc_shared, cpu, i); 415 link_mask(cpu_llc_shared_mask, cpu, i);
416 416
417 } 417 }
418 418
419 /* 419 /*
420 * This needs a separate iteration over the cpus because we rely on all 420 * This needs a separate iteration over the cpus because we rely on all
421 * cpu_sibling_mask links to be set-up. 421 * topology_sibling_cpumask links to be set-up.
422 */ 422 */
423 for_each_cpu(i, cpu_sibling_setup_mask) { 423 for_each_cpu(i, cpu_sibling_setup_mask) {
424 o = &cpu_data(i); 424 o = &cpu_data(i);
425 425
426 if ((i == cpu) || (has_mp && match_die(c, o))) { 426 if ((i == cpu) || (has_mp && match_die(c, o))) {
427 link_mask(core, cpu, i); 427 link_mask(topology_core_cpumask, cpu, i);
428 428
429 /* 429 /*
430 * Does this new cpu bringup a new core? 430 * Does this new cpu bringup a new core?
431 */ 431 */
432 if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) { 432 if (cpumask_weight(
433 topology_sibling_cpumask(cpu)) == 1) {
433 /* 434 /*
434 * for each core in package, increment 435 * for each core in package, increment
435 * the booted_cores for this new cpu 436 * the booted_cores for this new cpu
436 */ 437 */
437 if (cpumask_first(cpu_sibling_mask(i)) == i) 438 if (cpumask_first(
439 topology_sibling_cpumask(i)) == i)
438 c->booted_cores++; 440 c->booted_cores++;
439 /* 441 /*
440 * increment the core count for all 442 * increment the core count for all
@@ -1009,8 +1011,8 @@ static __init void disable_smp(void)
1009 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1011 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1010 else 1012 else
1011 physid_set_mask_of_physid(0, &phys_cpu_present_map); 1013 physid_set_mask_of_physid(0, &phys_cpu_present_map);
1012 cpumask_set_cpu(0, cpu_sibling_mask(0)); 1014 cpumask_set_cpu(0, topology_sibling_cpumask(0));
1013 cpumask_set_cpu(0, cpu_core_mask(0)); 1015 cpumask_set_cpu(0, topology_core_cpumask(0));
1014} 1016}
1015 1017
1016enum { 1018enum {
@@ -1293,22 +1295,22 @@ static void remove_siblinginfo(int cpu)
1293 int sibling; 1295 int sibling;
1294 struct cpuinfo_x86 *c = &cpu_data(cpu); 1296 struct cpuinfo_x86 *c = &cpu_data(cpu);
1295 1297
1296 for_each_cpu(sibling, cpu_core_mask(cpu)) { 1298 for_each_cpu(sibling, topology_core_cpumask(cpu)) {
1297 cpumask_clear_cpu(cpu, cpu_core_mask(sibling)); 1299 cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
1298 /*/ 1300 /*/
1299 * last thread sibling in this cpu core going down 1301 * last thread sibling in this cpu core going down
1300 */ 1302 */
1301 if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) 1303 if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
1302 cpu_data(sibling).booted_cores--; 1304 cpu_data(sibling).booted_cores--;
1303 } 1305 }
1304 1306
1305 for_each_cpu(sibling, cpu_sibling_mask(cpu)) 1307 for_each_cpu(sibling, topology_sibling_cpumask(cpu))
1306 cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling)); 1308 cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
1307 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) 1309 for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
1308 cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); 1310 cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
1309 cpumask_clear(cpu_llc_shared_mask(cpu)); 1311 cpumask_clear(cpu_llc_shared_mask(cpu));
1310 cpumask_clear(cpu_sibling_mask(cpu)); 1312 cpumask_clear(topology_sibling_cpumask(cpu));
1311 cpumask_clear(cpu_core_mask(cpu)); 1313 cpumask_clear(topology_core_cpumask(cpu));
1312 c->phys_proc_id = 0; 1314 c->phys_proc_id = 0;
1313 c->cpu_core_id = 0; 1315 c->cpu_core_id = 0;
1314 cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); 1316 cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 26488487bc61..dd8d0791dfb5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -113,7 +113,7 @@ static void check_tsc_warp(unsigned int timeout)
113 */ 113 */
114static inline unsigned int loop_timeout(int cpu) 114static inline unsigned int loop_timeout(int cpu)
115{ 115{
116 return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20; 116 return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
117} 117}
118 118
119/* 119/*
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 37d8fa4438f0..a0695be19864 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -75,7 +75,5 @@ EXPORT_SYMBOL(native_load_gs_index);
75 75
76#ifdef CONFIG_PREEMPT 76#ifdef CONFIG_PREEMPT
77EXPORT_SYMBOL(___preempt_schedule); 77EXPORT_SYMBOL(___preempt_schedule);
78#ifdef CONFIG_CONTEXT_TRACKING 78EXPORT_SYMBOL(___preempt_schedule_notrace);
79EXPORT_SYMBOL(___preempt_schedule_context);
80#endif
81#endif 79#endif
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 5eb715087b80..e407941d0488 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -38,8 +38,6 @@
38 38
39#ifdef CONFIG_PREEMPT 39#ifdef CONFIG_PREEMPT
40 THUNK ___preempt_schedule, preempt_schedule 40 THUNK ___preempt_schedule, preempt_schedule
41#ifdef CONFIG_CONTEXT_TRACKING 41 THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
42 THUNK ___preempt_schedule_context, preempt_schedule_context
43#endif
44#endif 42#endif
45 43
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index f89ba4e93025..2198902329b5 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -49,9 +49,7 @@
49 49
50#ifdef CONFIG_PREEMPT 50#ifdef CONFIG_PREEMPT
51 THUNK ___preempt_schedule, preempt_schedule 51 THUNK ___preempt_schedule, preempt_schedule
52#ifdef CONFIG_CONTEXT_TRACKING 52 THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
53 THUNK ___preempt_schedule_context, preempt_schedule_context
54#endif
55#endif 53#endif
56 54
57#if defined(CONFIG_TRACE_IRQFLAGS) \ 55#if defined(CONFIG_TRACE_IRQFLAGS) \
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index e2f5e21c03b3..91d93b95bd86 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -647,7 +647,8 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
647 * @from: Source address, in kernel space. 647 * @from: Source address, in kernel space.
648 * @n: Number of bytes to copy. 648 * @n: Number of bytes to copy.
649 * 649 *
650 * Context: User context only. This function may sleep. 650 * Context: User context only. This function may sleep if pagefaults are
651 * enabled.
651 * 652 *
652 * Copy data from kernel space to user space. 653 * Copy data from kernel space to user space.
653 * 654 *
@@ -668,7 +669,8 @@ EXPORT_SYMBOL(_copy_to_user);
668 * @from: Source address, in user space. 669 * @from: Source address, in user space.
669 * @n: Number of bytes to copy. 670 * @n: Number of bytes to copy.
670 * 671 *
671 * Context: User context only. This function may sleep. 672 * Context: User context only. This function may sleep if pagefaults are
673 * enabled.
672 * 674 *
673 * Copy data from user space to kernel space. 675 * Copy data from user space to kernel space.
674 * 676 *
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 181c53bac3a7..9dc909841739 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,6 +13,7 @@
13#include <linux/hugetlb.h> /* hstate_index_to_shift */ 13#include <linux/hugetlb.h> /* hstate_index_to_shift */
14#include <linux/prefetch.h> /* prefetchw */ 14#include <linux/prefetch.h> /* prefetchw */
15#include <linux/context_tracking.h> /* exception_enter(), ... */ 15#include <linux/context_tracking.h> /* exception_enter(), ... */
16#include <linux/uaccess.h> /* faulthandler_disabled() */
16 17
17#include <asm/traps.h> /* dotraplinkage, ... */ 18#include <asm/traps.h> /* dotraplinkage, ... */
18#include <asm/pgalloc.h> /* pgd_*(), ... */ 19#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -1126,9 +1127,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1126 1127
1127 /* 1128 /*
1128 * If we're in an interrupt, have no user context or are running 1129 * If we're in an interrupt, have no user context or are running
1129 * in an atomic region then we must not take the fault: 1130 * in a region with pagefaults disabled then we must not take the fault
1130 */ 1131 */
1131 if (unlikely(in_atomic() || !mm)) { 1132 if (unlikely(faulthandler_disabled() || !mm)) {
1132 bad_area_nosemaphore(regs, error_code, address); 1133 bad_area_nosemaphore(regs, error_code, address);
1133 return; 1134 return;
1134 } 1135 }
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 4500142bc4aa..eecb207a2037 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
35 unsigned long vaddr; 35 unsigned long vaddr;
36 int idx, type; 36 int idx, type;
37 37
38 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 38 preempt_disable();
39 pagefault_disable(); 39 pagefault_disable();
40 40
41 if (!PageHighMem(page)) 41 if (!PageHighMem(page))
@@ -100,6 +100,7 @@ void __kunmap_atomic(void *kvaddr)
100#endif 100#endif
101 101
102 pagefault_enable(); 102 pagefault_enable();
103 preempt_enable();
103} 104}
104EXPORT_SYMBOL(__kunmap_atomic); 105EXPORT_SYMBOL(__kunmap_atomic);
105 106
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 9ca35fc60cfe..2b7ece0e103a 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -59,6 +59,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
59 unsigned long vaddr; 59 unsigned long vaddr;
60 int idx, type; 60 int idx, type;
61 61
62 preempt_disable();
62 pagefault_disable(); 63 pagefault_disable();
63 64
64 type = kmap_atomic_idx_push(); 65 type = kmap_atomic_idx_push();
@@ -117,5 +118,6 @@ iounmap_atomic(void __iomem *kvaddr)
117 } 118 }
118 119
119 pagefault_enable(); 120 pagefault_enable();
121 preempt_enable();
120} 122}
121EXPORT_SYMBOL_GPL(iounmap_atomic); 123EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 9e3571a6535c..83a44a33cfa1 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -15,10 +15,10 @@
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/hardirq.h> 17#include <linux/hardirq.h>
18#include <linux/uaccess.h>
18#include <asm/mmu_context.h> 19#include <asm/mmu_context.h>
19#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
20#include <asm/hardirq.h> 21#include <asm/hardirq.h>
21#include <asm/uaccess.h>
22#include <asm/pgalloc.h> 22#include <asm/pgalloc.h>
23 23
24DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST; 24DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST;
@@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs)
57 /* If we're in an interrupt or have no user 57 /* If we're in an interrupt or have no user
58 * context, we must not take the fault.. 58 * context, we must not take the fault..
59 */ 59 */
60 if (in_atomic() || !mm) { 60 if (faulthandler_disabled() || !mm) {
61 bad_page_fault(regs, address, SIGSEGV); 61 bad_page_fault(regs, address, SIGSEGV);
62 return; 62 return;
63 } 63 }
diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c
index 8cfb71ec0937..184ceadccc1a 100644
--- a/arch/xtensa/mm/highmem.c
+++ b/arch/xtensa/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
42 enum fixed_addresses idx; 42 enum fixed_addresses idx;
43 unsigned long vaddr; 43 unsigned long vaddr;
44 44
45 preempt_disable();
45 pagefault_disable(); 46 pagefault_disable();
46 if (!PageHighMem(page)) 47 if (!PageHighMem(page))
47 return page_address(page); 48 return page_address(page);
@@ -79,6 +80,7 @@ void __kunmap_atomic(void *kvaddr)
79 } 80 }
80 81
81 pagefault_enable(); 82 pagefault_enable();
83 preempt_enable();
82} 84}
83EXPORT_SYMBOL(__kunmap_atomic); 85EXPORT_SYMBOL(__kunmap_atomic);
84 86
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 5f13f4d0bcce..1e28ddb656b8 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -24,7 +24,7 @@ static int get_first_sibling(unsigned int cpu)
24{ 24{
25 unsigned int ret; 25 unsigned int ret;
26 26
27 ret = cpumask_first(topology_thread_cpumask(cpu)); 27 ret = cpumask_first(topology_sibling_cpumask(cpu));
28 if (ret < nr_cpu_ids) 28 if (ret < nr_cpu_ids)
29 return ret; 29 return ret;
30 30
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index 6bc9cbc01ad6..00b39802d7ec 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -105,7 +105,7 @@ static void round_robin_cpu(unsigned int tsk_index)
105 mutex_lock(&round_robin_lock); 105 mutex_lock(&round_robin_lock);
106 cpumask_clear(tmp); 106 cpumask_clear(tmp);
107 for_each_cpu(cpu, pad_busy_cpus) 107 for_each_cpu(cpu, pad_busy_cpus)
108 cpumask_or(tmp, tmp, topology_thread_cpumask(cpu)); 108 cpumask_or(tmp, tmp, topology_sibling_cpumask(cpu));
109 cpumask_andnot(tmp, cpu_online_mask, tmp); 109 cpumask_andnot(tmp, cpu_online_mask, tmp);
110 /* avoid HT sibilings if possible */ 110 /* avoid HT sibilings if possible */
111 if (cpumask_empty(tmp)) 111 if (cpumask_empty(tmp))
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 6491f45200a7..8b7d7f8e5851 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -61,7 +61,7 @@ static DEVICE_ATTR_RO(physical_package_id);
61define_id_show_func(core_id); 61define_id_show_func(core_id);
62static DEVICE_ATTR_RO(core_id); 62static DEVICE_ATTR_RO(core_id);
63 63
64define_siblings_show_func(thread_siblings, thread_cpumask); 64define_siblings_show_func(thread_siblings, sibling_cpumask);
65static DEVICE_ATTR_RO(thread_siblings); 65static DEVICE_ATTR_RO(thread_siblings);
66static DEVICE_ATTR_RO(thread_siblings_list); 66static DEVICE_ATTR_RO(thread_siblings_list);
67 67
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index b0c18ed8d83f..0136dfcdabf0 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -699,13 +699,14 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
699 dmi_check_system(sw_any_bug_dmi_table); 699 dmi_check_system(sw_any_bug_dmi_table);
700 if (bios_with_sw_any_bug && !policy_is_shared(policy)) { 700 if (bios_with_sw_any_bug && !policy_is_shared(policy)) {
701 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; 701 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
702 cpumask_copy(policy->cpus, cpu_core_mask(cpu)); 702 cpumask_copy(policy->cpus, topology_core_cpumask(cpu));
703 } 703 }
704 704
705 if (check_amd_hwpstate_cpu(cpu) && !acpi_pstate_strict) { 705 if (check_amd_hwpstate_cpu(cpu) && !acpi_pstate_strict) {
706 cpumask_clear(policy->cpus); 706 cpumask_clear(policy->cpus);
707 cpumask_set_cpu(cpu, policy->cpus); 707 cpumask_set_cpu(cpu, policy->cpus);
708 cpumask_copy(data->freqdomain_cpus, cpu_sibling_mask(cpu)); 708 cpumask_copy(data->freqdomain_cpus,
709 topology_sibling_cpumask(cpu));
709 policy->shared_type = CPUFREQ_SHARED_TYPE_HW; 710 policy->shared_type = CPUFREQ_SHARED_TYPE_HW;
710 pr_info_once(PFX "overriding BIOS provided _PSD data\n"); 711 pr_info_once(PFX "overriding BIOS provided _PSD data\n");
711 } 712 }
diff --git a/drivers/cpufreq/p4-clockmod.c b/drivers/cpufreq/p4-clockmod.c
index 529cfd92158f..5dd95dab580d 100644
--- a/drivers/cpufreq/p4-clockmod.c
+++ b/drivers/cpufreq/p4-clockmod.c
@@ -172,7 +172,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
172 unsigned int i; 172 unsigned int i;
173 173
174#ifdef CONFIG_SMP 174#ifdef CONFIG_SMP
175 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); 175 cpumask_copy(policy->cpus, topology_sibling_cpumask(policy->cpu));
176#endif 176#endif
177 177
178 /* Errata workaround */ 178 /* Errata workaround */
diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c
index f9ce7e4bf0fe..5c035d04d827 100644
--- a/drivers/cpufreq/powernow-k8.c
+++ b/drivers/cpufreq/powernow-k8.c
@@ -57,13 +57,6 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
57 57
58static struct cpufreq_driver cpufreq_amd64_driver; 58static struct cpufreq_driver cpufreq_amd64_driver;
59 59
60#ifndef CONFIG_SMP
61static inline const struct cpumask *cpu_core_mask(int cpu)
62{
63 return cpumask_of(0);
64}
65#endif
66
67/* Return a frequency in MHz, given an input fid */ 60/* Return a frequency in MHz, given an input fid */
68static u32 find_freq_from_fid(u32 fid) 61static u32 find_freq_from_fid(u32 fid)
69{ 62{
@@ -620,7 +613,7 @@ static int fill_powernow_table(struct powernow_k8_data *data,
620 613
621 pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); 614 pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
622 data->powernow_table = powernow_table; 615 data->powernow_table = powernow_table;
623 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) 616 if (cpumask_first(topology_core_cpumask(data->cpu)) == data->cpu)
624 print_basics(data); 617 print_basics(data);
625 618
626 for (j = 0; j < data->numps; j++) 619 for (j = 0; j < data->numps; j++)
@@ -784,7 +777,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
784 CPUFREQ_TABLE_END; 777 CPUFREQ_TABLE_END;
785 data->powernow_table = powernow_table; 778 data->powernow_table = powernow_table;
786 779
787 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) 780 if (cpumask_first(topology_core_cpumask(data->cpu)) == data->cpu)
788 print_basics(data); 781 print_basics(data);
789 782
790 /* notify BIOS that we exist */ 783 /* notify BIOS that we exist */
@@ -1090,7 +1083,7 @@ static int powernowk8_cpu_init(struct cpufreq_policy *pol)
1090 if (rc != 0) 1083 if (rc != 0)
1091 goto err_out_exit_acpi; 1084 goto err_out_exit_acpi;
1092 1085
1093 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); 1086 cpumask_copy(pol->cpus, topology_core_cpumask(pol->cpu));
1094 data->available_cores = pol->cpus; 1087 data->available_cores = pol->cpus;
1095 1088
1096 /* min/max the cpu is capable of */ 1089 /* min/max the cpu is capable of */
diff --git a/drivers/cpufreq/speedstep-ich.c b/drivers/cpufreq/speedstep-ich.c
index e56d632a8b21..37555c6b86a7 100644
--- a/drivers/cpufreq/speedstep-ich.c
+++ b/drivers/cpufreq/speedstep-ich.c
@@ -292,7 +292,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
292 292
293 /* only run on CPU to be set, or on its sibling */ 293 /* only run on CPU to be set, or on its sibling */
294#ifdef CONFIG_SMP 294#ifdef CONFIG_SMP
295 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); 295 cpumask_copy(policy->cpus, topology_sibling_cpumask(policy->cpu));
296#endif 296#endif
297 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); 297 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
298 298
diff --git a/drivers/crypto/vmx/aes.c b/drivers/crypto/vmx/aes.c
index ab300ea19434..a9064e36e7b5 100644
--- a/drivers/crypto/vmx/aes.c
+++ b/drivers/crypto/vmx/aes.c
@@ -78,12 +78,14 @@ static int p8_aes_setkey(struct crypto_tfm *tfm, const u8 *key,
78 int ret; 78 int ret;
79 struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm); 79 struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
80 80
81 preempt_disable();
81 pagefault_disable(); 82 pagefault_disable();
82 enable_kernel_altivec(); 83 enable_kernel_altivec();
83 ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key); 84 ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
84 ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key); 85 ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
85 pagefault_enable(); 86 pagefault_enable();
86 87 preempt_enable();
88
87 ret += crypto_cipher_setkey(ctx->fallback, key, keylen); 89 ret += crypto_cipher_setkey(ctx->fallback, key, keylen);
88 return ret; 90 return ret;
89} 91}
@@ -95,10 +97,12 @@ static void p8_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
95 if (in_interrupt()) { 97 if (in_interrupt()) {
96 crypto_cipher_encrypt_one(ctx->fallback, dst, src); 98 crypto_cipher_encrypt_one(ctx->fallback, dst, src);
97 } else { 99 } else {
100 preempt_disable();
98 pagefault_disable(); 101 pagefault_disable();
99 enable_kernel_altivec(); 102 enable_kernel_altivec();
100 aes_p8_encrypt(src, dst, &ctx->enc_key); 103 aes_p8_encrypt(src, dst, &ctx->enc_key);
101 pagefault_enable(); 104 pagefault_enable();
105 preempt_enable();
102 } 106 }
103} 107}
104 108
@@ -109,10 +113,12 @@ static void p8_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
109 if (in_interrupt()) { 113 if (in_interrupt()) {
110 crypto_cipher_decrypt_one(ctx->fallback, dst, src); 114 crypto_cipher_decrypt_one(ctx->fallback, dst, src);
111 } else { 115 } else {
116 preempt_disable();
112 pagefault_disable(); 117 pagefault_disable();
113 enable_kernel_altivec(); 118 enable_kernel_altivec();
114 aes_p8_decrypt(src, dst, &ctx->dec_key); 119 aes_p8_decrypt(src, dst, &ctx->dec_key);
115 pagefault_enable(); 120 pagefault_enable();
121 preempt_enable();
116 } 122 }
117} 123}
118 124
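The VMX crypto drivers now bracket their vector-unit sections with an explicit preempt_disable()/preempt_enable() pair, since pagefault_disable() alone no longer keeps the task on the CPU whose AltiVec state enable_kernel_altivec() set up. The resulting pattern, as a hedged sketch (the body of the section is a placeholder):

#include <linux/preempt.h>
#include <linux/uaccess.h>
#include <asm/switch_to.h>              /* enable_kernel_altivec() on powerpc */

static void do_vector_section(void)
{
        preempt_disable();              /* stay on this CPU */
        pagefault_disable();            /* and never sleep in here */
        enable_kernel_altivec();
        /* ... AltiVec/VSX work ... */
        pagefault_enable();
        preempt_enable();
}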
diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
index 1a559b7dddb5..477284abdd11 100644
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
@@ -79,11 +79,13 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
79 int ret; 79 int ret;
80 struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm); 80 struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
81 81
82 preempt_disable();
82 pagefault_disable(); 83 pagefault_disable();
83 enable_kernel_altivec(); 84 enable_kernel_altivec();
84 ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key); 85 ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
85 ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key); 86 ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
86 pagefault_enable(); 87 pagefault_enable();
88 preempt_enable();
87 89
88 ret += crypto_blkcipher_setkey(ctx->fallback, key, keylen); 90 ret += crypto_blkcipher_setkey(ctx->fallback, key, keylen);
89 return ret; 91 return ret;
@@ -106,6 +108,7 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
106 if (in_interrupt()) { 108 if (in_interrupt()) {
107 ret = crypto_blkcipher_encrypt(&fallback_desc, dst, src, nbytes); 109 ret = crypto_blkcipher_encrypt(&fallback_desc, dst, src, nbytes);
108 } else { 110 } else {
111 preempt_disable();
109 pagefault_disable(); 112 pagefault_disable();
110 enable_kernel_altivec(); 113 enable_kernel_altivec();
111 114
@@ -119,6 +122,7 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
119 } 122 }
120 123
121 pagefault_enable(); 124 pagefault_enable();
125 preempt_enable();
122 } 126 }
123 127
124 return ret; 128 return ret;
@@ -141,6 +145,7 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
141 if (in_interrupt()) { 145 if (in_interrupt()) {
142 ret = crypto_blkcipher_decrypt(&fallback_desc, dst, src, nbytes); 146 ret = crypto_blkcipher_decrypt(&fallback_desc, dst, src, nbytes);
143 } else { 147 } else {
148 preempt_disable();
144 pagefault_disable(); 149 pagefault_disable();
145 enable_kernel_altivec(); 150 enable_kernel_altivec();
146 151
@@ -154,6 +159,7 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
154 } 159 }
155 160
156 pagefault_enable(); 161 pagefault_enable();
162 preempt_enable();
157 } 163 }
158 164
159 return ret; 165 return ret;
diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index d0ffe277af5c..f255ec4a04d4 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -114,11 +114,13 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
114 if (keylen != GHASH_KEY_LEN) 114 if (keylen != GHASH_KEY_LEN)
115 return -EINVAL; 115 return -EINVAL;
116 116
117 preempt_disable();
117 pagefault_disable(); 118 pagefault_disable();
118 enable_kernel_altivec(); 119 enable_kernel_altivec();
119 enable_kernel_fp(); 120 enable_kernel_fp();
120 gcm_init_p8(ctx->htable, (const u64 *) key); 121 gcm_init_p8(ctx->htable, (const u64 *) key);
121 pagefault_enable(); 122 pagefault_enable();
123 preempt_enable();
122 return crypto_shash_setkey(ctx->fallback, key, keylen); 124 return crypto_shash_setkey(ctx->fallback, key, keylen);
123} 125}
124 126
@@ -140,23 +142,27 @@ static int p8_ghash_update(struct shash_desc *desc,
140 } 142 }
141 memcpy(dctx->buffer + dctx->bytes, src, 143 memcpy(dctx->buffer + dctx->bytes, src,
142 GHASH_DIGEST_SIZE - dctx->bytes); 144 GHASH_DIGEST_SIZE - dctx->bytes);
145 preempt_disable();
143 pagefault_disable(); 146 pagefault_disable();
144 enable_kernel_altivec(); 147 enable_kernel_altivec();
145 enable_kernel_fp(); 148 enable_kernel_fp();
146 gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer, 149 gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer,
147 GHASH_DIGEST_SIZE); 150 GHASH_DIGEST_SIZE);
148 pagefault_enable(); 151 pagefault_enable();
152 preempt_enable();
149 src += GHASH_DIGEST_SIZE - dctx->bytes; 153 src += GHASH_DIGEST_SIZE - dctx->bytes;
150 srclen -= GHASH_DIGEST_SIZE - dctx->bytes; 154 srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
151 dctx->bytes = 0; 155 dctx->bytes = 0;
152 } 156 }
153 len = srclen & ~(GHASH_DIGEST_SIZE - 1); 157 len = srclen & ~(GHASH_DIGEST_SIZE - 1);
154 if (len) { 158 if (len) {
159 preempt_disable();
155 pagefault_disable(); 160 pagefault_disable();
156 enable_kernel_altivec(); 161 enable_kernel_altivec();
157 enable_kernel_fp(); 162 enable_kernel_fp();
158 gcm_ghash_p8(dctx->shash, ctx->htable, src, len); 163 gcm_ghash_p8(dctx->shash, ctx->htable, src, len);
159 pagefault_enable(); 164 pagefault_enable();
165 preempt_enable();
160 src += len; 166 src += len;
161 srclen -= len; 167 srclen -= len;
162 } 168 }
@@ -180,12 +186,14 @@ static int p8_ghash_final(struct shash_desc *desc, u8 *out)
180 if (dctx->bytes) { 186 if (dctx->bytes) {
181 for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++) 187 for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
182 dctx->buffer[i] = 0; 188 dctx->buffer[i] = 0;
189 preempt_disable();
183 pagefault_disable(); 190 pagefault_disable();
184 enable_kernel_altivec(); 191 enable_kernel_altivec();
185 enable_kernel_fp(); 192 enable_kernel_fp();
186 gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer, 193 gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer,
187 GHASH_DIGEST_SIZE); 194 GHASH_DIGEST_SIZE);
188 pagefault_enable(); 195 pagefault_enable();
196 preempt_enable();
189 dctx->bytes = 0; 197 dctx->bytes = 0;
190 } 198 }
191 memcpy(out, dctx->shash, GHASH_DIGEST_SIZE); 199 memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index a3190e793ed4..cc552a4c1f3b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@
32#include "i915_trace.h" 32#include "i915_trace.h"
33#include "intel_drv.h" 33#include "intel_drv.h"
34#include <linux/dma_remapping.h> 34#include <linux/dma_remapping.h>
35#include <linux/uaccess.h>
35 36
36#define __EXEC_OBJECT_HAS_PIN (1<<31) 37#define __EXEC_OBJECT_HAS_PIN (1<<31)
37#define __EXEC_OBJECT_HAS_FENCE (1<<30) 38#define __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -465,7 +466,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
465 } 466 }
466 467
467 /* We can't wait for rendering with pagefaults disabled */ 468 /* We can't wait for rendering with pagefaults disabled */
468 if (obj->active && in_atomic()) 469 if (obj->active && pagefault_disabled())
469 return -EFAULT; 470 return -EFAULT;
470 471
471 if (use_cpu_reloc(obj)) 472 if (use_cpu_reloc(obj))
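The i915 hunk replaces a fuzzy in_atomic() test with the question it actually meant to ask: did the caller enter this path with pagefaults disabled? The same guard in isolation, as a hedged sketch (helper and completion are illustrative):

#include <linux/uaccess.h>
#include <linux/completion.h>
#include <linux/errno.h>

/* Refuse to sleep when the caller is inside a pagefault-disabled section. */
static int wait_unless_atomic(struct completion *done)
{
        if (pagefault_disabled())
                return -EFAULT;         /* caller retries from a sleepable context */

        wait_for_completion(done);
        return 0;
}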
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ed303ba3a593..3e03379e7c5d 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -63,7 +63,8 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius");
63#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) 63#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO)
64 64
65#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
66#define for_each_sibling(i, cpu) for_each_cpu(i, cpu_sibling_mask(cpu)) 66#define for_each_sibling(i, cpu) \
67 for_each_cpu(i, topology_sibling_cpumask(cpu))
67#else 68#else
68#define for_each_sibling(i, cpu) for (i = 0; false; ) 69#define for_each_sibling(i, cpu) for (i = 0; false; )
69#endif 70#endif
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 4b00545a3ace..65944dd8bf6b 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1304,7 +1304,7 @@ static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
1304 if (!cpumask_test_cpu(cpu, thread_mask)) { 1304 if (!cpumask_test_cpu(cpu, thread_mask)) {
1305 ++count; 1305 ++count;
1306 cpumask_or(thread_mask, thread_mask, 1306 cpumask_or(thread_mask, thread_mask,
1307 topology_thread_cpumask(cpu)); 1307 topology_sibling_cpumask(cpu));
1308 } 1308 }
1309 } 1309 }
1310 1310
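efx_wanted_parallelism() counts one CPU per SMT group by OR-ing each group's sibling mask into a visited mask. The same idiom in standalone form, as a hedged sketch (function name and allocation flags are illustrative):

#include <linux/cpumask.h>
#include <linux/topology.h>
#include <linux/slab.h>

static unsigned int count_physical_cores(void)
{
        cpumask_var_t visited;
        unsigned int cpu, count = 0;

        if (!zalloc_cpumask_var(&visited, GFP_KERNEL))
                return num_online_cpus();       /* conservative fallback */

        for_each_online_cpu(cpu) {
                if (cpumask_test_cpu(cpu, visited))
                        continue;
                count++;
                cpumask_or(visited, visited, topology_sibling_cpumask(cpu));
        }

        free_cpumask_var(visited);
        return count;
}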
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
index cc3ab351943e..f9262243f935 100644
--- a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
@@ -87,7 +87,7 @@ static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
87/* return cpumask of HTs in the same core */ 87/* return cpumask of HTs in the same core */
88static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask) 88static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
89{ 89{
90 cpumask_copy(mask, topology_thread_cpumask(cpu)); 90 cpumask_copy(mask, topology_sibling_cpumask(cpu));
91} 91}
92 92
93static void cfs_node_to_cpumask(int node, cpumask_t *mask) 93static void cfs_node_to_cpumask(int node, cpumask_t *mask)
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
index 8e61421515cb..344189ac5698 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/service.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -557,7 +557,7 @@ ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
557 * there are. 557 * there are.
558 */ 558 */
559 /* weight is # of HTs */ 559 /* weight is # of HTs */
560 if (cpumask_weight(topology_thread_cpumask(0)) > 1) { 560 if (cpumask_weight(topology_sibling_cpumask(0)) > 1) {
561 /* depress thread factor for hyper-thread */ 561 /* depress thread factor for hyper-thread */
562 factor = factor - (factor >> 1) + (factor >> 3); 562 factor = factor - (factor >> 1) + (factor >> 3);
563 } 563 }
@@ -2768,7 +2768,7 @@ int ptlrpc_hr_init(void)
2768 2768
2769 init_waitqueue_head(&ptlrpc_hr.hr_waitq); 2769 init_waitqueue_head(&ptlrpc_hr.hr_waitq);
2770 2770
2771 weight = cpumask_weight(topology_thread_cpumask(0)); 2771 weight = cpumask_weight(topology_sibling_cpumask(0));
2772 2772
2773 cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { 2773 cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
2774 hrp->hrp_cpt = i; 2774 hrp->hrp_cpt = i;
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index b59b5a52637e..e56272c919b5 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -8,8 +8,7 @@
8#ifndef CONFIG_SMP 8#ifndef CONFIG_SMP
9/* 9/*
10 * The following implementation only for uniprocessor machines. 10 * The following implementation only for uniprocessor machines.
11 * For UP, it's relies on the fact that pagefault_disable() also disables 11 * It relies on preempt_disable() ensuring mutual exclusion.
12 * preemption to ensure mutual exclusion.
13 * 12 *
14 */ 13 */
15 14
@@ -38,6 +37,7 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
38 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 37 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
39 oparg = 1 << oparg; 38 oparg = 1 << oparg;
40 39
40 preempt_disable();
41 pagefault_disable(); 41 pagefault_disable();
42 42
43 ret = -EFAULT; 43 ret = -EFAULT;
@@ -72,6 +72,7 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
72 72
73out_pagefault_enable: 73out_pagefault_enable:
74 pagefault_enable(); 74 pagefault_enable();
75 preempt_enable();
75 76
76 if (ret == 0) { 77 if (ret == 0) {
77 switch (cmp) { 78 switch (cmp) {
@@ -106,6 +107,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
106{ 107{
107 u32 val; 108 u32 val;
108 109
110 preempt_disable();
109 if (unlikely(get_user(val, uaddr) != 0)) 111 if (unlikely(get_user(val, uaddr) != 0))
110 return -EFAULT; 112 return -EFAULT;
111 113
@@ -113,6 +115,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
113 return -EFAULT; 115 return -EFAULT;
114 116
115 *uval = val; 117 *uval = val;
118 preempt_enable();
116 119
117 return 0; 120 return 0;
118} 121}
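With pagefault_disable() no longer implying preempt_disable(), the UP-only generic futex ops above spell out both. A hedged sketch of the same mutual-exclusion pattern (hypothetical helper, uniprocessor assumption as in the header):

#include <linux/preempt.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/errno.h>

/* Atomically add @add to a user-space word; UP only, like the header above. */
static int up_user_fetch_add(u32 __user *uaddr, u32 add, u32 *oldval)
{
        u32 val;
        int ret = -EFAULT;

        preempt_disable();              /* excludes other tasks on the one CPU */
        pagefault_disable();            /* the accesses below must not sleep */
        if (__get_user(val, uaddr) == 0 &&
            __put_user(val + add, uaddr) == 0) {
                *oldval = val;
                ret = 0;
        }
        pagefault_enable();
        preempt_enable();

        return ret;
}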
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index eb6f9e6c3075..d0a7a4753db2 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -79,11 +79,8 @@ static __always_inline bool should_resched(void)
79#ifdef CONFIG_PREEMPT 79#ifdef CONFIG_PREEMPT
80extern asmlinkage void preempt_schedule(void); 80extern asmlinkage void preempt_schedule(void);
81#define __preempt_schedule() preempt_schedule() 81#define __preempt_schedule() preempt_schedule()
82 82extern asmlinkage void preempt_schedule_notrace(void);
83#ifdef CONFIG_CONTEXT_TRACKING 83#define __preempt_schedule_notrace() preempt_schedule_notrace()
84extern asmlinkage void preempt_schedule_context(void);
85#define __preempt_schedule_context() preempt_schedule_context()
86#endif
87#endif /* CONFIG_PREEMPT */ 84#endif /* CONFIG_PREEMPT */
88 85
89#endif /* __ASM_PREEMPT_H */ 86#endif /* __ASM_PREEMPT_H */
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index 86c12c93e3cf..8fdcb783197d 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -2,7 +2,6 @@
2#define _LINUX_BH_H 2#define _LINUX_BH_H
3 3
4#include <linux/preempt.h> 4#include <linux/preempt.h>
5#include <linux/preempt_mask.h>
6 5
7#ifdef CONFIG_TRACE_IRQFLAGS 6#ifdef CONFIG_TRACE_IRQFLAGS
8extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); 7extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f4af03404b97..dfd59d6bc6f0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -1,7 +1,7 @@
1#ifndef LINUX_HARDIRQ_H 1#ifndef LINUX_HARDIRQ_H
2#define LINUX_HARDIRQ_H 2#define LINUX_HARDIRQ_H
3 3
4#include <linux/preempt_mask.h> 4#include <linux/preempt.h>
5#include <linux/lockdep.h> 5#include <linux/lockdep.h>
6#include <linux/ftrace_irq.h> 6#include <linux/ftrace_irq.h>
7#include <linux/vtime.h> 7#include <linux/vtime.h>
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 9286a46b7d69..6aefcd0031a6 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -65,6 +65,7 @@ static inline void kunmap(struct page *page)
65 65
66static inline void *kmap_atomic(struct page *page) 66static inline void *kmap_atomic(struct page *page)
67{ 67{
68 preempt_disable();
68 pagefault_disable(); 69 pagefault_disable();
69 return page_address(page); 70 return page_address(page);
70} 71}
@@ -73,6 +74,7 @@ static inline void *kmap_atomic(struct page *page)
73static inline void __kunmap_atomic(void *addr) 74static inline void __kunmap_atomic(void *addr)
74{ 75{
75 pagefault_enable(); 76 pagefault_enable();
77 preempt_enable();
76} 78}
77 79
78#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) 80#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn))
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d22312b31..bb9b075f0eb0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,8 @@ extern struct fs_struct init_fs;
50 .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ 50 .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
51 .rlim = INIT_RLIMITS, \ 51 .rlim = INIT_RLIMITS, \
52 .cputimer = { \ 52 .cputimer = { \
53 .cputime = INIT_CPUTIME, \ 53 .cputime_atomic = INIT_CPUTIME_ATOMIC, \
54 .running = 0, \ 54 .running = 0, \
55 .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
56 }, \ 55 }, \
57 .cred_guard_mutex = \ 56 .cred_guard_mutex = \
58 __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ 57 __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 657fab4efab3..c27dde7215b5 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -141,6 +141,7 @@ static inline void __iomem *
141io_mapping_map_atomic_wc(struct io_mapping *mapping, 141io_mapping_map_atomic_wc(struct io_mapping *mapping,
142 unsigned long offset) 142 unsigned long offset)
143{ 143{
144 preempt_disable();
144 pagefault_disable(); 145 pagefault_disable();
145 return ((char __force __iomem *) mapping) + offset; 146 return ((char __force __iomem *) mapping) + offset;
146} 147}
@@ -149,6 +150,7 @@ static inline void
149io_mapping_unmap_atomic(void __iomem *vaddr) 150io_mapping_unmap_atomic(void __iomem *vaddr)
150{ 151{
151 pagefault_enable(); 152 pagefault_enable();
153 preempt_enable();
152} 154}
153 155
154/* Non-atomic map/unmap */ 156/* Non-atomic map/unmap */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3a5b48e52a9e..060dd7b61c6d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -244,7 +244,8 @@ static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
244 244
245#if defined(CONFIG_MMU) && \ 245#if defined(CONFIG_MMU) && \
246 (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)) 246 (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP))
247void might_fault(void); 247#define might_fault() __might_fault(__FILE__, __LINE__)
248void __might_fault(const char *file, int line);
248#else 249#else
249static inline void might_fault(void) { } 250static inline void might_fault(void) { }
250#endif 251#endif
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 0081f000e34b..c92ebd100d9b 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -52,10 +52,15 @@ struct lglock {
52 static struct lglock name = { .lock = &name ## _lock } 52 static struct lglock name = { .lock = &name ## _lock }
53 53
54void lg_lock_init(struct lglock *lg, char *name); 54void lg_lock_init(struct lglock *lg, char *name);
55
55void lg_local_lock(struct lglock *lg); 56void lg_local_lock(struct lglock *lg);
56void lg_local_unlock(struct lglock *lg); 57void lg_local_unlock(struct lglock *lg);
57void lg_local_lock_cpu(struct lglock *lg, int cpu); 58void lg_local_lock_cpu(struct lglock *lg, int cpu);
58void lg_local_unlock_cpu(struct lglock *lg, int cpu); 59void lg_local_unlock_cpu(struct lglock *lg, int cpu);
60
61void lg_double_lock(struct lglock *lg, int cpu1, int cpu2);
62void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
63
59void lg_global_lock(struct lglock *lg); 64void lg_global_lock(struct lglock *lg);
60void lg_global_unlock(struct lglock *lg); 65void lg_global_unlock(struct lglock *lg);
61 66
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index de83b4eb1642..0f1534acaf60 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -10,13 +10,117 @@
10#include <linux/list.h> 10#include <linux/list.h>
11 11
12/* 12/*
13 * We use the MSB mostly because its available; see <linux/preempt_mask.h> for 13 * We put the hardirq and softirq counter into the preemption
14 * the other bits -- can't include that header due to inclusion hell. 14 * counter. The bitmask has the following meaning:
15 *
16 * - bits 0-7 are the preemption count (max preemption depth: 256)
17 * - bits 8-15 are the softirq count (max # of softirqs: 256)
18 *
19 * The hardirq count could in theory be the same as the number of
20 * interrupts in the system, but we run all interrupt handlers with
21 * interrupts disabled, so we cannot have nesting interrupts. Though
22 * there are a few palaeontologic drivers which reenable interrupts in
23 * the handler, so we need more than one bit here.
24 *
25 * PREEMPT_MASK: 0x000000ff
26 * SOFTIRQ_MASK: 0x0000ff00
27 * HARDIRQ_MASK: 0x000f0000
28 * NMI_MASK: 0x00100000
29 * PREEMPT_ACTIVE: 0x00200000
30 * PREEMPT_NEED_RESCHED: 0x80000000
15 */ 31 */
32#define PREEMPT_BITS 8
33#define SOFTIRQ_BITS 8
34#define HARDIRQ_BITS 4
35#define NMI_BITS 1
36
37#define PREEMPT_SHIFT 0
38#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
39#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
40#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
41
42#define __IRQ_MASK(x) ((1UL << (x))-1)
43
44#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
45#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
46#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
47#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
48
49#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
50#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
51#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
52#define NMI_OFFSET (1UL << NMI_SHIFT)
53
54#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
55
56#define PREEMPT_ACTIVE_BITS 1
57#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
58#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
59
60/* We use the MSB mostly because its available */
16#define PREEMPT_NEED_RESCHED 0x80000000 61#define PREEMPT_NEED_RESCHED 0x80000000
17 62
63/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
18#include <asm/preempt.h> 64#include <asm/preempt.h>
19 65
66#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
67#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
68#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
69 | NMI_MASK))
70
71/*
72 * Are we doing bottom half or hardware interrupt processing?
73 * Are we in a softirq context? Interrupt context?
74 * in_softirq - Are we currently processing softirq or have bh disabled?
75 * in_serving_softirq - Are we currently processing softirq?
76 */
77#define in_irq() (hardirq_count())
78#define in_softirq() (softirq_count())
79#define in_interrupt() (irq_count())
80#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
81
82/*
83 * Are we in NMI context?
84 */
85#define in_nmi() (preempt_count() & NMI_MASK)
86
87#if defined(CONFIG_PREEMPT_COUNT)
88# define PREEMPT_DISABLE_OFFSET 1
89#else
90# define PREEMPT_DISABLE_OFFSET 0
91#endif
92
93/*
94 * The preempt_count offset needed for things like:
95 *
96 * spin_lock_bh()
97 *
98 * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
99 * softirqs, such that unlock sequences of:
100 *
101 * spin_unlock();
102 * local_bh_enable();
103 *
104 * Work as expected.
105 */
106#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET)
107
108/*
109 * Are we running in atomic context? WARNING: this macro cannot
110 * always detect atomic context; in particular, it cannot know about
111 * held spinlocks in non-preemptible kernels. Thus it should not be
112 * used in the general case to determine whether sleeping is possible.
113 * Do not use in_atomic() in driver code.
114 */
115#define in_atomic() (preempt_count() != 0)
116
117/*
118 * Check whether we were atomic before we did preempt_disable():
119 * (used by the scheduler)
120 */
121#define in_atomic_preempt_off() \
122 ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
123
20#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) 124#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
21extern void preempt_count_add(int val); 125extern void preempt_count_add(int val);
22extern void preempt_count_sub(int val); 126extern void preempt_count_sub(int val);
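
The mask and shift definitions above (moved here from preempt_mask.h, which this series deletes below) document how preempt_count() packs the preemption depth, softirq, hardirq and NMI counters into one word. Purely as an illustration of that layout, the following standalone C sketch reuses the same arithmetic to decode a made-up example value; nothing in it comes from the patch itself:

#include <stdio.h>

#define PREEMPT_BITS    8
#define SOFTIRQ_BITS    8
#define HARDIRQ_BITS    4
#define NMI_BITS        1

#define PREEMPT_SHIFT   0
#define SOFTIRQ_SHIFT   (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT   (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT       (HARDIRQ_SHIFT + HARDIRQ_BITS)

#define __IRQ_MASK(x)   ((1UL << (x)) - 1)
#define PREEMPT_MASK    (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK    (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK    (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK        (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)

int main(void)
{
        /* made-up count: hardirq nesting 1, softirq count 1, preempt depth 2 */
        unsigned long count = (1UL << HARDIRQ_SHIFT) | (1UL << SOFTIRQ_SHIFT) | 2;

        printf("preempt depth: %lu\n", (count & PREEMPT_MASK) >> PREEMPT_SHIFT);
        printf("softirq count: %lu\n", (count & SOFTIRQ_MASK) >> SOFTIRQ_SHIFT);
        printf("hardirq count: %lu\n", (count & HARDIRQ_MASK) >> HARDIRQ_SHIFT);
        printf("in_nmi:        %lu\n", (count & NMI_MASK) >> NMI_SHIFT);
        return 0;
}
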
@@ -33,6 +137,18 @@ extern void preempt_count_sub(int val);
33#define preempt_count_inc() preempt_count_add(1) 137#define preempt_count_inc() preempt_count_add(1)
34#define preempt_count_dec() preempt_count_sub(1) 138#define preempt_count_dec() preempt_count_sub(1)
35 139
140#define preempt_active_enter() \
141do { \
142 preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
143 barrier(); \
144} while (0)
145
146#define preempt_active_exit() \
147do { \
148 barrier(); \
149 preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
150} while (0)
151
36#ifdef CONFIG_PREEMPT_COUNT 152#ifdef CONFIG_PREEMPT_COUNT
37 153
38#define preempt_disable() \ 154#define preempt_disable() \
@@ -49,6 +165,8 @@ do { \
49 165
50#define preempt_enable_no_resched() sched_preempt_enable_no_resched() 166#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
51 167
168#define preemptible() (preempt_count() == 0 && !irqs_disabled())
169
52#ifdef CONFIG_PREEMPT 170#ifdef CONFIG_PREEMPT
53#define preempt_enable() \ 171#define preempt_enable() \
54do { \ 172do { \
@@ -57,52 +175,46 @@ do { \
57 __preempt_schedule(); \ 175 __preempt_schedule(); \
58} while (0) 176} while (0)
59 177
178#define preempt_enable_notrace() \
179do { \
180 barrier(); \
181 if (unlikely(__preempt_count_dec_and_test())) \
182 __preempt_schedule_notrace(); \
183} while (0)
184
60#define preempt_check_resched() \ 185#define preempt_check_resched() \
61do { \ 186do { \
62 if (should_resched()) \ 187 if (should_resched()) \
63 __preempt_schedule(); \ 188 __preempt_schedule(); \
64} while (0) 189} while (0)
65 190
66#else 191#else /* !CONFIG_PREEMPT */
67#define preempt_enable() \ 192#define preempt_enable() \
68do { \ 193do { \
69 barrier(); \ 194 barrier(); \
70 preempt_count_dec(); \ 195 preempt_count_dec(); \
71} while (0) 196} while (0)
72#define preempt_check_resched() do { } while (0)
73#endif
74
75#define preempt_disable_notrace() \
76do { \
77 __preempt_count_inc(); \
78 barrier(); \
79} while (0)
80 197
81#define preempt_enable_no_resched_notrace() \ 198#define preempt_enable_notrace() \
82do { \ 199do { \
83 barrier(); \ 200 barrier(); \
84 __preempt_count_dec(); \ 201 __preempt_count_dec(); \
85} while (0) 202} while (0)
86 203
87#ifdef CONFIG_PREEMPT 204#define preempt_check_resched() do { } while (0)
88 205#endif /* CONFIG_PREEMPT */
89#ifndef CONFIG_CONTEXT_TRACKING
90#define __preempt_schedule_context() __preempt_schedule()
91#endif
92 206
93#define preempt_enable_notrace() \ 207#define preempt_disable_notrace() \
94do { \ 208do { \
209 __preempt_count_inc(); \
95 barrier(); \ 210 barrier(); \
96 if (unlikely(__preempt_count_dec_and_test())) \
97 __preempt_schedule_context(); \
98} while (0) 211} while (0)
99#else 212
100#define preempt_enable_notrace() \ 213#define preempt_enable_no_resched_notrace() \
101do { \ 214do { \
102 barrier(); \ 215 barrier(); \
103 __preempt_count_dec(); \ 216 __preempt_count_dec(); \
104} while (0) 217} while (0)
105#endif
106 218
107#else /* !CONFIG_PREEMPT_COUNT */ 219#else /* !CONFIG_PREEMPT_COUNT */
108 220
@@ -121,6 +233,7 @@ do { \
121#define preempt_disable_notrace() barrier() 233#define preempt_disable_notrace() barrier()
122#define preempt_enable_no_resched_notrace() barrier() 234#define preempt_enable_no_resched_notrace() barrier()
123#define preempt_enable_notrace() barrier() 235#define preempt_enable_notrace() barrier()
236#define preemptible() 0
124 237
125#endif /* CONFIG_PREEMPT_COUNT */ 238#endif /* CONFIG_PREEMPT_COUNT */
126 239
diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h
deleted file mode 100644
index dbeec4d4a3be..000000000000
--- a/include/linux/preempt_mask.h
+++ /dev/null
@@ -1,117 +0,0 @@
1#ifndef LINUX_PREEMPT_MASK_H
2#define LINUX_PREEMPT_MASK_H
3
4#include <linux/preempt.h>
5
6/*
7 * We put the hardirq and softirq counter into the preemption
8 * counter. The bitmask has the following meaning:
9 *
10 * - bits 0-7 are the preemption count (max preemption depth: 256)
11 * - bits 8-15 are the softirq count (max # of softirqs: 256)
12 *
13 * The hardirq count could in theory be the same as the number of
14 * interrupts in the system, but we run all interrupt handlers with
15 * interrupts disabled, so we cannot have nesting interrupts. Though
16 * there are a few palaeontologic drivers which reenable interrupts in
17 * the handler, so we need more than one bit here.
18 *
19 * PREEMPT_MASK: 0x000000ff
20 * SOFTIRQ_MASK: 0x0000ff00
21 * HARDIRQ_MASK: 0x000f0000
22 * NMI_MASK: 0x00100000
23 * PREEMPT_ACTIVE: 0x00200000
24 */
25#define PREEMPT_BITS 8
26#define SOFTIRQ_BITS 8
27#define HARDIRQ_BITS 4
28#define NMI_BITS 1
29
30#define PREEMPT_SHIFT 0
31#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
32#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
33#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
34
35#define __IRQ_MASK(x) ((1UL << (x))-1)
36
37#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
38#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
39#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
40#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
41
42#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
43#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
44#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
45#define NMI_OFFSET (1UL << NMI_SHIFT)
46
47#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
48
49#define PREEMPT_ACTIVE_BITS 1
50#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
51#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
52
53#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
54#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
55#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
56 | NMI_MASK))
57
58/*
59 * Are we doing bottom half or hardware interrupt processing?
60 * Are we in a softirq context? Interrupt context?
61 * in_softirq - Are we currently processing softirq or have bh disabled?
62 * in_serving_softirq - Are we currently processing softirq?
63 */
64#define in_irq() (hardirq_count())
65#define in_softirq() (softirq_count())
66#define in_interrupt() (irq_count())
67#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
68
69/*
70 * Are we in NMI context?
71 */
72#define in_nmi() (preempt_count() & NMI_MASK)
73
74#if defined(CONFIG_PREEMPT_COUNT)
75# define PREEMPT_CHECK_OFFSET 1
76#else
77# define PREEMPT_CHECK_OFFSET 0
78#endif
79
80/*
81 * The preempt_count offset needed for things like:
82 *
83 * spin_lock_bh()
84 *
85 * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
86 * softirqs, such that unlock sequences of:
87 *
88 * spin_unlock();
89 * local_bh_enable();
90 *
91 * Work as expected.
92 */
93#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET)
94
95/*
96 * Are we running in atomic context? WARNING: this macro cannot
97 * always detect atomic context; in particular, it cannot know about
98 * held spinlocks in non-preemptible kernels. Thus it should not be
99 * used in the general case to determine whether sleeping is possible.
100 * Do not use in_atomic() in driver code.
101 */
102#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
103
104/*
105 * Check whether we were atomic before we did preempt_disable():
106 * (used by the scheduler, *after* releasing the kernel lock)
107 */
108#define in_atomic_preempt_off() \
109 ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
110
111#ifdef CONFIG_PREEMPT_COUNT
112# define preemptible() (preempt_count() == 0 && !irqs_disabled())
113#else
114# define preemptible() 0
115#endif
116
117#endif /* LINUX_PREEMPT_MASK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 659f5729cacc..d4193d5613cf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -25,7 +25,7 @@ struct sched_param {
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include <linux/nodemask.h> 26#include <linux/nodemask.h>
27#include <linux/mm_types.h> 27#include <linux/mm_types.h>
28#include <linux/preempt_mask.h> 28#include <linux/preempt.h>
29 29
30#include <asm/page.h> 30#include <asm/page.h>
31#include <asm/ptrace.h> 31#include <asm/ptrace.h>
@@ -174,7 +174,12 @@ extern unsigned long nr_iowait_cpu(int cpu);
174extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); 174extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
175 175
176extern void calc_global_load(unsigned long ticks); 176extern void calc_global_load(unsigned long ticks);
177
178#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
177extern void update_cpu_load_nohz(void); 179extern void update_cpu_load_nohz(void);
180#else
181static inline void update_cpu_load_nohz(void) { }
182#endif
178 183
179extern unsigned long get_parent_ip(unsigned long addr); 184extern unsigned long get_parent_ip(unsigned long addr);
180 185
@@ -214,9 +219,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
214#define TASK_WAKEKILL 128 219#define TASK_WAKEKILL 128
215#define TASK_WAKING 256 220#define TASK_WAKING 256
216#define TASK_PARKED 512 221#define TASK_PARKED 512
217#define TASK_STATE_MAX 1024 222#define TASK_NOLOAD 1024
223#define TASK_STATE_MAX 2048
218 224
219#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP" 225#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
220 226
221extern char ___assert_task_state[1 - 2*!!( 227extern char ___assert_task_state[1 - 2*!!(
222 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; 228 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@ -226,6 +232,8 @@ extern char ___assert_task_state[1 - 2*!!(
226#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) 232#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
227#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) 233#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
228 234
235#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
236
229/* Convenience macros for the sake of wake_up */ 237/* Convenience macros for the sake of wake_up */
230#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) 238#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
231#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) 239#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
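
TASK_NOLOAD and the TASK_IDLE combination above let a task sleep in the uninterruptible state without being counted by task_contributes_to_load(), i.e. without inflating the load average. A hedged sketch of how a kernel thread might use it follows; my_idle_worker is an invented name, not something introduced by this patch:

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_idle_worker(void *unused)
{
        while (!kthread_should_stop()) {
                /* sleeps in D state, but does not contribute to loadavg */
                set_current_state(TASK_IDLE);
                if (kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
                        break;
                }
                schedule();
        }
        return 0;
}
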
@@ -241,7 +249,8 @@ extern char ___assert_task_state[1 - 2*!!(
241 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) 249 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
242#define task_contributes_to_load(task) \ 250#define task_contributes_to_load(task) \
243 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ 251 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
244 (task->flags & PF_FROZEN) == 0) 252 (task->flags & PF_FROZEN) == 0 && \
253 (task->state & TASK_NOLOAD) == 0)
245 254
246#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 255#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
247 256
@@ -568,6 +577,23 @@ struct task_cputime {
568 .sum_exec_runtime = 0, \ 577 .sum_exec_runtime = 0, \
569 } 578 }
570 579
580/*
581 * This is the atomic variant of task_cputime, which can be used for
582 * storing and updating task_cputime statistics without locking.
583 */
584struct task_cputime_atomic {
585 atomic64_t utime;
586 atomic64_t stime;
587 atomic64_t sum_exec_runtime;
588};
589
590#define INIT_CPUTIME_ATOMIC \
591 (struct task_cputime_atomic) { \
592 .utime = ATOMIC64_INIT(0), \
593 .stime = ATOMIC64_INIT(0), \
594 .sum_exec_runtime = ATOMIC64_INIT(0), \
595 }
596
571#ifdef CONFIG_PREEMPT_COUNT 597#ifdef CONFIG_PREEMPT_COUNT
572#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) 598#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
573#else 599#else
@@ -585,18 +611,16 @@ struct task_cputime {
585 611
586/** 612/**
587 * struct thread_group_cputimer - thread group interval timer counts 613 * struct thread_group_cputimer - thread group interval timer counts
588 * @cputime: thread group interval timers. 614 * @cputime_atomic: atomic thread group interval timers.
589 * @running: non-zero when there are timers running and 615 * @running: non-zero when there are timers running and
590 * @cputime receives updates. 616 * @cputime receives updates.
591 * @lock: lock for fields in this struct.
592 * 617 *
593 * This structure contains the version of task_cputime, above, that is 618 * This structure contains the version of task_cputime, above, that is
594 * used for thread group CPU timer calculations. 619 * used for thread group CPU timer calculations.
595 */ 620 */
596struct thread_group_cputimer { 621struct thread_group_cputimer {
597 struct task_cputime cputime; 622 struct task_cputime_atomic cputime_atomic;
598 int running; 623 int running;
599 raw_spinlock_t lock;
600}; 624};
601 625
602#include <linux/rwsem.h> 626#include <linux/rwsem.h>
@@ -901,6 +925,50 @@ enum cpu_idle_type {
901#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) 925#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
902 926
903/* 927/*
928 * Wake-queues are lists of tasks with a pending wakeup, whose
929 * callers have already marked the task as woken internally,
930 * and can thus carry on. A common use case is being able to
931 * do the wakeups once the corresponding user lock as been
932 * released.
933 *
934 * We hold reference to each task in the list across the wakeup,
935 * thus guaranteeing that the memory is still valid by the time
936 * the actual wakeups are performed in wake_up_q().
937 *
938 * One per task suffices, because there's never a need for a task to be
939 * in two wake queues simultaneously; it is forbidden to abandon a task
940 * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
941 * already in a wake queue, the wakeup will happen soon and the second
942 * waker can just skip it.
943 *
944 * The WAKE_Q macro declares and initializes the list head.
945 * wake_up_q() does NOT reinitialize the list; it's expected to be
946 * called near the end of a function, where the fact that the queue is
947 * not used again will be easy to see by inspection.
948 *
949 * Note that this can cause spurious wakeups. schedule() callers
950 * must ensure the call is done inside a loop, confirming that the
951 * wakeup condition has in fact occurred.
952 */
953struct wake_q_node {
954 struct wake_q_node *next;
955};
956
957struct wake_q_head {
958 struct wake_q_node *first;
959 struct wake_q_node **lastp;
960};
961
962#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
963
964#define WAKE_Q(name) \
965 struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
966
967extern void wake_q_add(struct wake_q_head *head,
968 struct task_struct *task);
969extern void wake_up_q(struct wake_q_head *head);
970
971/*
904 * sched-domains (multiprocessor balancing) declarations: 972 * sched-domains (multiprocessor balancing) declarations:
905 */ 973 */
906#ifdef CONFIG_SMP 974#ifdef CONFIG_SMP
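
The wake-queue comment above describes the intended usage pattern: mark tasks for wakeup while still holding the lock that serializes the waiter list, then issue the actual wakeups with wake_up_q() once the lock has been dropped, exactly what the futex and mqueue conversions later in this diff do. A minimal sketch of such a caller, with hypothetical my_waiter/my_wake_all names, might look like this:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct my_waiter {
        struct list_head        list;
        struct task_struct      *task;
};

/* Wake every waiter on @waiters without holding @lock across the wakeups. */
static void my_wake_all(spinlock_t *lock, struct list_head *waiters)
{
        struct my_waiter *w, *tmp;
        WAKE_Q(wake_q);

        spin_lock(lock);
        list_for_each_entry_safe(w, tmp, waiters, list) {
                list_del(&w->list);
                wake_q_add(&wake_q, w->task);   /* takes a reference on the task */
        }
        spin_unlock(lock);

        wake_up_q(&wake_q);     /* the wake_up_process() calls happen here */
}
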
@@ -1335,8 +1403,6 @@ struct task_struct {
1335 int rcu_read_lock_nesting; 1403 int rcu_read_lock_nesting;
1336 union rcu_special rcu_read_unlock_special; 1404 union rcu_special rcu_read_unlock_special;
1337 struct list_head rcu_node_entry; 1405 struct list_head rcu_node_entry;
1338#endif /* #ifdef CONFIG_PREEMPT_RCU */
1339#ifdef CONFIG_PREEMPT_RCU
1340 struct rcu_node *rcu_blocked_node; 1406 struct rcu_node *rcu_blocked_node;
1341#endif /* #ifdef CONFIG_PREEMPT_RCU */ 1407#endif /* #ifdef CONFIG_PREEMPT_RCU */
1342#ifdef CONFIG_TASKS_RCU 1408#ifdef CONFIG_TASKS_RCU
@@ -1367,7 +1433,7 @@ struct task_struct {
1367 int exit_state; 1433 int exit_state;
1368 int exit_code, exit_signal; 1434 int exit_code, exit_signal;
1369 int pdeath_signal; /* The signal sent when the parent dies */ 1435 int pdeath_signal; /* The signal sent when the parent dies */
1370 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1436 unsigned long jobctl; /* JOBCTL_*, siglock protected */
1371 1437
1372 /* Used for emulating ABI behavior of previous Linux versions */ 1438 /* Used for emulating ABI behavior of previous Linux versions */
1373 unsigned int personality; 1439 unsigned int personality;
@@ -1513,6 +1579,8 @@ struct task_struct {
1513 /* Protection of the PI data structures: */ 1579 /* Protection of the PI data structures: */
1514 raw_spinlock_t pi_lock; 1580 raw_spinlock_t pi_lock;
1515 1581
1582 struct wake_q_node wake_q;
1583
1516#ifdef CONFIG_RT_MUTEXES 1584#ifdef CONFIG_RT_MUTEXES
1517 /* PI waiters blocked on a rt_mutex held by this task */ 1585 /* PI waiters blocked on a rt_mutex held by this task */
1518 struct rb_root pi_waiters; 1586 struct rb_root pi_waiters;
@@ -1726,6 +1794,7 @@ struct task_struct {
1726#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 1794#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
1727 unsigned long task_state_change; 1795 unsigned long task_state_change;
1728#endif 1796#endif
1797 int pagefault_disabled;
1729}; 1798};
1730 1799
1731/* Future-safe accessor for struct task_struct's cpus_allowed. */ 1800/* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -2079,22 +2148,22 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
2079#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ 2148#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
2080#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ 2149#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
2081 2150
2082#define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) 2151#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
2083#define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) 2152#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
2084#define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) 2153#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT)
2085#define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) 2154#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT)
2086#define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) 2155#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT)
2087#define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) 2156#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT)
2088#define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) 2157#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
2089 2158
2090#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) 2159#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
2091#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) 2160#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
2092 2161
2093extern bool task_set_jobctl_pending(struct task_struct *task, 2162extern bool task_set_jobctl_pending(struct task_struct *task,
2094 unsigned int mask); 2163 unsigned long mask);
2095extern void task_clear_jobctl_trapping(struct task_struct *task); 2164extern void task_clear_jobctl_trapping(struct task_struct *task);
2096extern void task_clear_jobctl_pending(struct task_struct *task, 2165extern void task_clear_jobctl_pending(struct task_struct *task,
2097 unsigned int mask); 2166 unsigned long mask);
2098 2167
2099static inline void rcu_copy_process(struct task_struct *p) 2168static inline void rcu_copy_process(struct task_struct *p)
2100{ 2169{
@@ -2964,11 +3033,6 @@ static __always_inline bool need_resched(void)
2964void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); 3033void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
2965void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); 3034void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
2966 3035
2967static inline void thread_group_cputime_init(struct signal_struct *sig)
2968{
2969 raw_spin_lock_init(&sig->cputimer.lock);
2970}
2971
2972/* 3036/*
2973 * Reevaluate whether the task has signals pending delivery. 3037 * Reevaluate whether the task has signals pending delivery.
2974 * Wake the task if so. 3038 * Wake the task if so.
@@ -3082,13 +3146,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
3082static inline unsigned long task_rlimit(const struct task_struct *tsk, 3146static inline unsigned long task_rlimit(const struct task_struct *tsk,
3083 unsigned int limit) 3147 unsigned int limit)
3084{ 3148{
3085 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); 3149 return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
3086} 3150}
3087 3151
3088static inline unsigned long task_rlimit_max(const struct task_struct *tsk, 3152static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
3089 unsigned int limit) 3153 unsigned int limit)
3090{ 3154{
3091 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); 3155 return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
3092} 3156}
3093 3157
3094static inline unsigned long rlimit(unsigned int limit) 3158static inline unsigned long rlimit(unsigned int limit)
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 909b6e43b694..73ddad1e0fa3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -191,8 +191,8 @@ static inline int cpu_to_mem(int cpu)
191#ifndef topology_core_id 191#ifndef topology_core_id
192#define topology_core_id(cpu) ((void)(cpu), 0) 192#define topology_core_id(cpu) ((void)(cpu), 0)
193#endif 193#endif
194#ifndef topology_thread_cpumask 194#ifndef topology_sibling_cpumask
195#define topology_thread_cpumask(cpu) cpumask_of(cpu) 195#define topology_sibling_cpumask(cpu) cpumask_of(cpu)
196#endif 196#endif
197#ifndef topology_core_cpumask 197#ifndef topology_core_cpumask
198#define topology_core_cpumask(cpu) cpumask_of(cpu) 198#define topology_core_cpumask(cpu) cpumask_of(cpu)
@@ -201,7 +201,7 @@ static inline int cpu_to_mem(int cpu)
201#ifdef CONFIG_SCHED_SMT 201#ifdef CONFIG_SCHED_SMT
202static inline const struct cpumask *cpu_smt_mask(int cpu) 202static inline const struct cpumask *cpu_smt_mask(int cpu)
203{ 203{
204 return topology_thread_cpumask(cpu); 204 return topology_sibling_cpumask(cpu);
205} 205}
206#endif 206#endif
207 207
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index ecd3319dac33..ae572c138607 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -1,21 +1,30 @@
1#ifndef __LINUX_UACCESS_H__ 1#ifndef __LINUX_UACCESS_H__
2#define __LINUX_UACCESS_H__ 2#define __LINUX_UACCESS_H__
3 3
4#include <linux/preempt.h> 4#include <linux/sched.h>
5#include <asm/uaccess.h> 5#include <asm/uaccess.h>
6 6
7static __always_inline void pagefault_disabled_inc(void)
8{
9 current->pagefault_disabled++;
10}
11
12static __always_inline void pagefault_disabled_dec(void)
13{
14 current->pagefault_disabled--;
15 WARN_ON(current->pagefault_disabled < 0);
16}
17
7/* 18/*
8 * These routines enable/disable the pagefault handler in that 19 * These routines enable/disable the pagefault handler. If disabled, it will
9 * it will not take any locks and go straight to the fixup table. 20 * not take any locks and go straight to the fixup table.
10 * 21 *
11 * They have great resemblance to the preempt_disable/enable calls 22 * User access methods will not sleep when called from a pagefault_disabled()
12 * and in fact they are identical; this is because currently there is 23 * environment.
13 * no other way to make the pagefault handlers do this. So we do
14 * disable preemption but we don't necessarily care about that.
15 */ 24 */
16static inline void pagefault_disable(void) 25static inline void pagefault_disable(void)
17{ 26{
18 preempt_count_inc(); 27 pagefault_disabled_inc();
19 /* 28 /*
20 * make sure to have issued the store before a pagefault 29 * make sure to have issued the store before a pagefault
21 * can hit. 30 * can hit.
@@ -25,18 +34,31 @@ static inline void pagefault_disable(void)
25 34
26static inline void pagefault_enable(void) 35static inline void pagefault_enable(void)
27{ 36{
28#ifndef CONFIG_PREEMPT
29 /* 37 /*
30 * make sure to issue those last loads/stores before enabling 38 * make sure to issue those last loads/stores before enabling
31 * the pagefault handler again. 39 * the pagefault handler again.
32 */ 40 */
33 barrier(); 41 barrier();
34 preempt_count_dec(); 42 pagefault_disabled_dec();
35#else
36 preempt_enable();
37#endif
38} 43}
39 44
45/*
46 * Is the pagefault handler disabled? If so, user access methods will not sleep.
47 */
48#define pagefault_disabled() (current->pagefault_disabled != 0)
49
50/*
51 * The pagefault handler is in general disabled by pagefault_disable() or
52 * when in irq context (via in_atomic()).
53 *
54 * This function should only be used by the fault handlers. Other users should
55 * stick to pagefault_disabled().
56 * Please NEVER use preempt_disable() to disable the fault handler. With
57 * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled.
58 * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT.
59 */
60#define faulthandler_disabled() (pagefault_disabled() || in_atomic())
61
40#ifndef ARCH_HAS_NOCACHE_UACCESS 62#ifndef ARCH_HAS_NOCACHE_UACCESS
41 63
42static inline unsigned long __copy_from_user_inatomic_nocache(void *to, 64static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
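
With this rework, pagefault_disable() only increments current->pagefault_disabled; it no longer touches the preempt count, which is why the kmap_atomic(), io-mapping and futex paths earlier in this diff gain explicit preempt_disable()/preempt_enable() pairs. A hedged sketch of a caller that only needs the fault handler suppressed (peek_user_word is an invented name, not part of the patch):

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/* Probe a user word without sleeping; the task stays preemptible. */
static int peek_user_word(u32 __user *uaddr, u32 *val)
{
        unsigned long left;

        pagefault_disable();            /* pagefault_disabled() is now true */
        left = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
        pagefault_enable();

        return left ? -EFAULT : 0;
}
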
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2db83349865b..d69ac4ecc88b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -969,7 +969,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *);
969 * on that signal. 969 * on that signal.
970 */ 970 */
971static inline int 971static inline int
972wait_on_bit(void *word, int bit, unsigned mode) 972wait_on_bit(unsigned long *word, int bit, unsigned mode)
973{ 973{
974 might_sleep(); 974 might_sleep();
975 if (!test_bit(bit, word)) 975 if (!test_bit(bit, word))
@@ -994,7 +994,7 @@ wait_on_bit(void *word, int bit, unsigned mode)
994 * on that signal. 994 * on that signal.
995 */ 995 */
996static inline int 996static inline int
997wait_on_bit_io(void *word, int bit, unsigned mode) 997wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
998{ 998{
999 might_sleep(); 999 might_sleep();
1000 if (!test_bit(bit, word)) 1000 if (!test_bit(bit, word))
@@ -1020,7 +1020,8 @@ wait_on_bit_io(void *word, int bit, unsigned mode)
1020 * received a signal and the mode permitted wakeup on that signal. 1020 * received a signal and the mode permitted wakeup on that signal.
1021 */ 1021 */
1022static inline int 1022static inline int
1023wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) 1023wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
1024 unsigned long timeout)
1024{ 1025{
1025 might_sleep(); 1026 might_sleep();
1026 if (!test_bit(bit, word)) 1027 if (!test_bit(bit, word))
@@ -1047,7 +1048,8 @@ wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout)
1047 * on that signal. 1048 * on that signal.
1048 */ 1049 */
1049static inline int 1050static inline int
1050wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) 1051wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
1052 unsigned mode)
1051{ 1053{
1052 might_sleep(); 1054 might_sleep();
1053 if (!test_bit(bit, word)) 1055 if (!test_bit(bit, word))
@@ -1075,7 +1077,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode
1075 * the @mode allows that signal to wake the process. 1077 * the @mode allows that signal to wake the process.
1076 */ 1078 */
1077static inline int 1079static inline int
1078wait_on_bit_lock(void *word, int bit, unsigned mode) 1080wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
1079{ 1081{
1080 might_sleep(); 1082 might_sleep();
1081 if (!test_and_set_bit(bit, word)) 1083 if (!test_and_set_bit(bit, word))
@@ -1099,7 +1101,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode)
1099 * the @mode allows that signal to wake the process. 1101 * the @mode allows that signal to wake the process.
1100 */ 1102 */
1101static inline int 1103static inline int
1102wait_on_bit_lock_io(void *word, int bit, unsigned mode) 1104wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
1103{ 1105{
1104 might_sleep(); 1106 might_sleep();
1105 if (!test_and_set_bit(bit, word)) 1107 if (!test_and_set_bit(bit, word))
@@ -1125,7 +1127,8 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode)
1125 * the @mode allows that signal to wake the process. 1127 * the @mode allows that signal to wake the process.
1126 */ 1128 */
1127static inline int 1129static inline int
1128wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) 1130wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
1131 unsigned mode)
1129{ 1132{
1130 might_sleep(); 1133 might_sleep();
1131 if (!test_and_set_bit(bit, word)) 1134 if (!test_and_set_bit(bit, word))
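
The wait_on_bit*() family now takes an unsigned long * word rather than void *, so a mistyped flags word is rejected at compile time. A small hedged example of the wait and wake sides under the new prototype (my_flags and MY_BIT_BUSY are illustrative names):

#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait.h>

#define MY_BIT_BUSY     0

static unsigned long my_flags;          /* must be an unsigned long word now */

static int my_wait_until_idle(void)
{
        /* returns non-zero if a signal interrupted the sleep */
        return wait_on_bit(&my_flags, MY_BIT_BUSY, TASK_INTERRUPTIBLE);
}

static void my_mark_idle(void)
{
        clear_bit(MY_BIT_BUSY, &my_flags);
        smp_mb__after_atomic();
        wake_up_bit(&my_flags, MY_BIT_BUSY);
}
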
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 30fedaf3e56a..d57a575fe31f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -147,7 +147,8 @@ TRACE_EVENT(sched_switch,
147 __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", 147 __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
148 { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, 148 { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
149 { 16, "Z" }, { 32, "X" }, { 64, "x" }, 149 { 16, "Z" }, { 32, "X" }, { 64, "x" },
150 { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R", 150 { 128, "K" }, { 256, "W" }, { 512, "P" },
151 { 1024, "N" }) : "R",
151 __entry->prev_state & TASK_STATE_MAX ? "+" : "", 152 __entry->prev_state & TASK_STATE_MAX ? "+" : "",
152 __entry->next_comm, __entry->next_pid, __entry->next_prio) 153 __entry->next_comm, __entry->next_pid, __entry->next_prio)
153); 154);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 3aaea7ffd077..a24ba9fe5bb8 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -47,8 +47,7 @@
47#define RECV 1 47#define RECV 1
48 48
49#define STATE_NONE 0 49#define STATE_NONE 0
50#define STATE_PENDING 1 50#define STATE_READY 1
51#define STATE_READY 2
52 51
53struct posix_msg_tree_node { 52struct posix_msg_tree_node {
54 struct rb_node rb_node; 53 struct rb_node rb_node;
@@ -571,15 +570,12 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
571 wq_add(info, sr, ewp); 570 wq_add(info, sr, ewp);
572 571
573 for (;;) { 572 for (;;) {
574 set_current_state(TASK_INTERRUPTIBLE); 573 __set_current_state(TASK_INTERRUPTIBLE);
575 574
576 spin_unlock(&info->lock); 575 spin_unlock(&info->lock);
577 time = schedule_hrtimeout_range_clock(timeout, 0, 576 time = schedule_hrtimeout_range_clock(timeout, 0,
578 HRTIMER_MODE_ABS, CLOCK_REALTIME); 577 HRTIMER_MODE_ABS, CLOCK_REALTIME);
579 578
580 while (ewp->state == STATE_PENDING)
581 cpu_relax();
582
583 if (ewp->state == STATE_READY) { 579 if (ewp->state == STATE_READY) {
584 retval = 0; 580 retval = 0;
585 goto out; 581 goto out;
@@ -907,11 +903,15 @@ out_name:
907 * list of waiting receivers. A sender checks that list before adding the new 903 * list of waiting receivers. A sender checks that list before adding the new
908 * message into the message array. If there is a waiting receiver, then it 904 * message into the message array. If there is a waiting receiver, then it
909 * bypasses the message array and directly hands the message over to the 905 * bypasses the message array and directly hands the message over to the
910 * receiver. 906 * receiver. The receiver accepts the message and returns without grabbing the
911 * The receiver accepts the message and returns without grabbing the queue 907 * queue spinlock:
912 * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers 908 *
913 * are necessary. The same algorithm is used for sysv semaphores, see 909 * - Set pointer to message.
914 * ipc/sem.c for more details. 910 * - Queue the receiver task for later wakeup (without the info->lock).
911 * - Update its state to STATE_READY. Now the receiver can continue.
912 * - Wake up the process after the lock is dropped. Should the process wake up
913 * before this wakeup (due to a timeout or a signal) it will either see
914 * STATE_READY and continue or acquire the lock to check the state again.
915 * 915 *
916 * The same algorithm is used for senders. 916 * The same algorithm is used for senders.
917 */ 917 */
@@ -919,21 +919,29 @@ out_name:
919/* pipelined_send() - send a message directly to the task waiting in 919/* pipelined_send() - send a message directly to the task waiting in
920 * sys_mq_timedreceive() (without inserting message into a queue). 920 * sys_mq_timedreceive() (without inserting message into a queue).
921 */ 921 */
922static inline void pipelined_send(struct mqueue_inode_info *info, 922static inline void pipelined_send(struct wake_q_head *wake_q,
923 struct mqueue_inode_info *info,
923 struct msg_msg *message, 924 struct msg_msg *message,
924 struct ext_wait_queue *receiver) 925 struct ext_wait_queue *receiver)
925{ 926{
926 receiver->msg = message; 927 receiver->msg = message;
927 list_del(&receiver->list); 928 list_del(&receiver->list);
928 receiver->state = STATE_PENDING; 929 wake_q_add(wake_q, receiver->task);
929 wake_up_process(receiver->task); 930 /*
930 smp_wmb(); 931 * Rely on the implicit cmpxchg barrier from wake_q_add such
932 * that we can ensure that updating receiver->state is the last
933 * write operation: As once set, the receiver can continue,
934 * and if we don't have the reference count from the wake_q,
935 * yet, at that point we can later have a use-after-free
936 * condition and bogus wakeup.
937 */
931 receiver->state = STATE_READY; 938 receiver->state = STATE_READY;
932} 939}
933 940
934/* pipelined_receive() - if there is task waiting in sys_mq_timedsend() 941/* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
935 * gets its message and put to the queue (we have one free place for sure). */ 942 * gets its message and put to the queue (we have one free place for sure). */
936static inline void pipelined_receive(struct mqueue_inode_info *info) 943static inline void pipelined_receive(struct wake_q_head *wake_q,
944 struct mqueue_inode_info *info)
937{ 945{
938 struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND); 946 struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
939 947
@@ -944,10 +952,9 @@ static inline void pipelined_receive(struct mqueue_inode_info *info)
944 } 952 }
945 if (msg_insert(sender->msg, info)) 953 if (msg_insert(sender->msg, info))
946 return; 954 return;
955
947 list_del(&sender->list); 956 list_del(&sender->list);
948 sender->state = STATE_PENDING; 957 wake_q_add(wake_q, sender->task);
949 wake_up_process(sender->task);
950 smp_wmb();
951 sender->state = STATE_READY; 958 sender->state = STATE_READY;
952} 959}
953 960
@@ -965,6 +972,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
965 struct timespec ts; 972 struct timespec ts;
966 struct posix_msg_tree_node *new_leaf = NULL; 973 struct posix_msg_tree_node *new_leaf = NULL;
967 int ret = 0; 974 int ret = 0;
975 WAKE_Q(wake_q);
968 976
969 if (u_abs_timeout) { 977 if (u_abs_timeout) {
970 int res = prepare_timeout(u_abs_timeout, &expires, &ts); 978 int res = prepare_timeout(u_abs_timeout, &expires, &ts);
@@ -1049,7 +1057,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
1049 } else { 1057 } else {
1050 receiver = wq_get_first_waiter(info, RECV); 1058 receiver = wq_get_first_waiter(info, RECV);
1051 if (receiver) { 1059 if (receiver) {
1052 pipelined_send(info, msg_ptr, receiver); 1060 pipelined_send(&wake_q, info, msg_ptr, receiver);
1053 } else { 1061 } else {
1054 /* adds message to the queue */ 1062 /* adds message to the queue */
1055 ret = msg_insert(msg_ptr, info); 1063 ret = msg_insert(msg_ptr, info);
@@ -1062,6 +1070,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
1062 } 1070 }
1063out_unlock: 1071out_unlock:
1064 spin_unlock(&info->lock); 1072 spin_unlock(&info->lock);
1073 wake_up_q(&wake_q);
1065out_free: 1074out_free:
1066 if (ret) 1075 if (ret)
1067 free_msg(msg_ptr); 1076 free_msg(msg_ptr);
@@ -1149,14 +1158,17 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
1149 msg_ptr = wait.msg; 1158 msg_ptr = wait.msg;
1150 } 1159 }
1151 } else { 1160 } else {
1161 WAKE_Q(wake_q);
1162
1152 msg_ptr = msg_get(info); 1163 msg_ptr = msg_get(info);
1153 1164
1154 inode->i_atime = inode->i_mtime = inode->i_ctime = 1165 inode->i_atime = inode->i_mtime = inode->i_ctime =
1155 CURRENT_TIME; 1166 CURRENT_TIME;
1156 1167
1157 /* There is now free space in queue. */ 1168 /* There is now free space in queue. */
1158 pipelined_receive(info); 1169 pipelined_receive(&wake_q, info);
1159 spin_unlock(&info->lock); 1170 spin_unlock(&info->lock);
1171 wake_up_q(&wake_q);
1160 ret = 0; 1172 ret = 0;
1161 } 1173 }
1162 if (ret == 0) { 1174 if (ret == 0) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..0bb88b555550 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,10 +1091,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
1091{ 1091{
1092 unsigned long cpu_limit; 1092 unsigned long cpu_limit;
1093 1093
1094 /* Thread group counters. */ 1094 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1095 thread_group_cputime_init(sig);
1096
1097 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1098 if (cpu_limit != RLIM_INFINITY) { 1095 if (cpu_limit != RLIM_INFINITY) {
1099 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); 1096 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
1100 sig->cputimer.running = 1; 1097 sig->cputimer.running = 1;
@@ -1396,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1396 p->hardirq_context = 0; 1393 p->hardirq_context = 0;
1397 p->softirq_context = 0; 1394 p->softirq_context = 0;
1398#endif 1395#endif
1396
1397 p->pagefault_disabled = 0;
1398
1399#ifdef CONFIG_LOCKDEP 1399#ifdef CONFIG_LOCKDEP
1400 p->lockdep_depth = 0; /* no locks held yet */ 1400 p->lockdep_depth = 0; /* no locks held yet */
1401 p->curr_chain_key = 0; 1401 p->curr_chain_key = 0;
diff --git a/kernel/futex.c b/kernel/futex.c
index 55ca63ad9622..aacc706f85fc 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
1090 1090
1091/* 1091/*
1092 * The hash bucket lock must be held when this is called. 1092 * The hash bucket lock must be held when this is called.
1093 * Afterwards, the futex_q must not be accessed. 1093 * Afterwards, the futex_q must not be accessed. Callers
1094 * must ensure to later call wake_up_q() for the actual
1095 * wakeups to occur.
1094 */ 1096 */
1095static void wake_futex(struct futex_q *q) 1097static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1096{ 1098{
1097 struct task_struct *p = q->task; 1099 struct task_struct *p = q->task;
1098 1100
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
1100 return; 1102 return;
1101 1103
1102 /* 1104 /*
1103 * We set q->lock_ptr = NULL _before_ we wake up the task. If 1105 * Queue the task for later wakeup for after we've released
1104 * a non-futex wake up happens on another CPU then the task 1106 * the hb->lock. wake_q_add() grabs reference to p.
1105 * might exit and p would dereference a non-existing task
1106 * struct. Prevent this by holding a reference on p across the
1107 * wake up.
1108 */ 1107 */
1109 get_task_struct(p); 1108 wake_q_add(wake_q, p);
1110
1111 __unqueue_futex(q); 1109 __unqueue_futex(q);
1112 /* 1110 /*
1113 * The waiting task can free the futex_q as soon as 1111 * The waiting task can free the futex_q as soon as
@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q)
1117 */ 1115 */
1118 smp_wmb(); 1116 smp_wmb();
1119 q->lock_ptr = NULL; 1117 q->lock_ptr = NULL;
1120
1121 wake_up_state(p, TASK_NORMAL);
1122 put_task_struct(p);
1123} 1118}
1124 1119
1125static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 1120static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1217 struct futex_q *this, *next; 1212 struct futex_q *this, *next;
1218 union futex_key key = FUTEX_KEY_INIT; 1213 union futex_key key = FUTEX_KEY_INIT;
1219 int ret; 1214 int ret;
1215 WAKE_Q(wake_q);
1220 1216
1221 if (!bitset) 1217 if (!bitset)
1222 return -EINVAL; 1218 return -EINVAL;
@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1244 if (!(this->bitset & bitset)) 1240 if (!(this->bitset & bitset))
1245 continue; 1241 continue;
1246 1242
1247 wake_futex(this); 1243 mark_wake_futex(&wake_q, this);
1248 if (++ret >= nr_wake) 1244 if (++ret >= nr_wake)
1249 break; 1245 break;
1250 } 1246 }
1251 } 1247 }
1252 1248
1253 spin_unlock(&hb->lock); 1249 spin_unlock(&hb->lock);
1250 wake_up_q(&wake_q);
1254out_put_key: 1251out_put_key:
1255 put_futex_key(&key); 1252 put_futex_key(&key);
1256out: 1253out:
@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1269 struct futex_hash_bucket *hb1, *hb2; 1266 struct futex_hash_bucket *hb1, *hb2;
1270 struct futex_q *this, *next; 1267 struct futex_q *this, *next;
1271 int ret, op_ret; 1268 int ret, op_ret;
1269 WAKE_Q(wake_q);
1272 1270
1273retry: 1271retry:
1274 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1272 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1318,7 @@ retry_private:
1320 ret = -EINVAL; 1318 ret = -EINVAL;
1321 goto out_unlock; 1319 goto out_unlock;
1322 } 1320 }
1323 wake_futex(this); 1321 mark_wake_futex(&wake_q, this);
1324 if (++ret >= nr_wake) 1322 if (++ret >= nr_wake)
1325 break; 1323 break;
1326 } 1324 }
@@ -1334,7 +1332,7 @@ retry_private:
1334 ret = -EINVAL; 1332 ret = -EINVAL;
1335 goto out_unlock; 1333 goto out_unlock;
1336 } 1334 }
1337 wake_futex(this); 1335 mark_wake_futex(&wake_q, this);
1338 if (++op_ret >= nr_wake2) 1336 if (++op_ret >= nr_wake2)
1339 break; 1337 break;
1340 } 1338 }
@@ -1344,6 +1342,7 @@ retry_private:
1344 1342
1345out_unlock: 1343out_unlock:
1346 double_unlock_hb(hb1, hb2); 1344 double_unlock_hb(hb1, hb2);
1345 wake_up_q(&wake_q);
1347out_put_keys: 1346out_put_keys:
1348 put_futex_key(&key2); 1347 put_futex_key(&key2);
1349out_put_key1: 1348out_put_key1:
@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1503 struct futex_pi_state *pi_state = NULL; 1502 struct futex_pi_state *pi_state = NULL;
1504 struct futex_hash_bucket *hb1, *hb2; 1503 struct futex_hash_bucket *hb1, *hb2;
1505 struct futex_q *this, *next; 1504 struct futex_q *this, *next;
1505 WAKE_Q(wake_q);
1506 1506
1507 if (requeue_pi) { 1507 if (requeue_pi) {
1508 /* 1508 /*
@@ -1679,7 +1679,7 @@ retry_private:
1679 * woken by futex_unlock_pi(). 1679 * woken by futex_unlock_pi().
1680 */ 1680 */
1681 if (++task_count <= nr_wake && !requeue_pi) { 1681 if (++task_count <= nr_wake && !requeue_pi) {
1682 wake_futex(this); 1682 mark_wake_futex(&wake_q, this);
1683 continue; 1683 continue;
1684 } 1684 }
1685 1685
@@ -1719,6 +1719,7 @@ retry_private:
1719out_unlock: 1719out_unlock:
1720 free_pi_state(pi_state); 1720 free_pi_state(pi_state);
1721 double_unlock_hb(hb1, hb2); 1721 double_unlock_hb(hb1, hb2);
1722 wake_up_q(&wake_q);
1722 hb_waiters_dec(hb2); 1723 hb_waiters_dec(hb2);
1723 1724
1724 /* 1725 /*
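
futex_wake() and its siblings now collect waiters on a wake_q under the hash-bucket lock and call wake_up_q() only after spin_unlock(&hb->lock), so the lock is no longer held across wake_up_process(). The userspace interface is unchanged; a minimal waiter/waker pair that ends up exercising this path (illustrative code, not from the patch) could look like:

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long futex(uint32_t *uaddr, int op, uint32_t val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static uint32_t flag;                   /* 0 = not ready, 1 = ready */

static void waiter(void)
{
        while (__atomic_load_n(&flag, __ATOMIC_ACQUIRE) == 0)
                futex(&flag, FUTEX_WAIT, 0);    /* sleeps in futex_wait() */
}

static void waker(void)
{
        __atomic_store_n(&flag, 1, __ATOMIC_RELEASE);
        futex(&flag, FUTEX_WAKE, 1);            /* runs futex_wake() above */
}
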
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..951cfcd10b4a 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
60} 60}
61EXPORT_SYMBOL(lg_local_unlock_cpu); 61EXPORT_SYMBOL(lg_local_unlock_cpu);
62 62
63void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
64{
65 BUG_ON(cpu1 == cpu2);
66
67 /* lock in cpu order, just like lg_global_lock */
68 if (cpu2 < cpu1)
69 swap(cpu1, cpu2);
70
71 preempt_disable();
72 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
73 arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
74 arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
75}
76
77void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
78{
79 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
80 arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
81 arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
82 preempt_enable();
83}
84
63void lg_global_lock(struct lglock *lg) 85void lg_global_lock(struct lglock *lg)
64{ 86{
65 int i; 87 int i;
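
lg_double_lock() avoids an ABBA deadlock between two per-CPU locks by always taking them in ascending CPU order, the same order lg_global_lock() uses. The discipline generalizes to any pair of locks; a hedged userspace sketch of the idea with POSIX mutexes (again, illustrative names only):

#include <pthread.h>
#include <stdint.h>

/* Lock two peer mutexes in a fixed (address) order so that two threads
 * locking the same pair in opposite argument order cannot deadlock. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if ((uintptr_t)b < (uintptr_t)a) {      /* mirrors "if (cpu2 < cpu1) swap()" */
                pthread_mutex_t *t = a;
                a = b;
                b = t;
        }
        pthread_mutex_lock(a);
        pthread_mutex_lock(b);
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        pthread_mutex_unlock(b);
}
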
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be87024875..67687973ce80 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o 14obj-y += core.o loadavg.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16obj-y += wait.o completion.o idle.o 16obj-y += wait.o completion.o idle.o
17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd669d..750ed601ddf7 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include "sched.h" 1#include "sched.h"
4 2
5#include <linux/proc_fs.h> 3#include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
141 139
142 p->signal->autogroup = autogroup_kref_get(ag); 140 p->signal->autogroup = autogroup_kref_get(ag);
143 141
144 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) 142 if (!READ_ONCE(sysctl_sched_autogroup_enabled))
145 goto out; 143 goto out;
146 144
147 for_each_thread(p, t) 145 for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
249 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 247 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
250} 248}
251#endif /* CONFIG_SCHED_DEBUG */ 249#endif /* CONFIG_SCHED_DEBUG */
252
253#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
29static inline struct task_group * 29static inline struct task_group *
30autogroup_task_group(struct task_struct *p, struct task_group *tg) 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{ 31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 32 int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
33 33
34 if (enabled && task_wants_autogroup(p, tg)) 34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg; 35 return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index db9b10a78d74..f89ca9bcf42a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
511static bool set_nr_if_polling(struct task_struct *p) 511static bool set_nr_if_polling(struct task_struct *p)
512{ 512{
513 struct thread_info *ti = task_thread_info(p); 513 struct thread_info *ti = task_thread_info(p);
514 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); 514 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
515 515
516 for (;;) { 516 for (;;) {
517 if (!(val & _TIF_POLLING_NRFLAG)) 517 if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p)
541#endif 541#endif
542#endif 542#endif
543 543
544void wake_q_add(struct wake_q_head *head, struct task_struct *task)
545{
546 struct wake_q_node *node = &task->wake_q;
547
548 /*
549 * Atomically grab the task, if ->wake_q is !nil already it means
550 * its already queued (either by us or someone else) and will get the
551 * wakeup due to that.
552 *
553 * This cmpxchg() implies a full barrier, which pairs with the write
554 * barrier implied by the wakeup in wake_up_list().
555 */
556 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
557 return;
558
559 get_task_struct(task);
560
561 /*
562 * The head is context local, there can be no concurrency.
563 */
564 *head->lastp = node;
565 head->lastp = &node->next;
566}
567
568void wake_up_q(struct wake_q_head *head)
569{
570 struct wake_q_node *node = head->first;
571
572 while (node != WAKE_Q_TAIL) {
573 struct task_struct *task;
574
575 task = container_of(node, struct task_struct, wake_q);
576 BUG_ON(!task);
577 /* task can safely be re-inserted now */
578 node = node->next;
579 task->wake_q.next = NULL;
580
581 /*
582 * wake_up_process() implies a wmb() to pair with the queueing
583 * in wake_q_add() so as not to miss wakeups.
584 */
585 wake_up_process(task);
586 put_task_struct(task);
587 }
588}
589
544/* 590/*
545 * resched_curr - mark rq's current task 'to be rescheduled now'. 591 * resched_curr - mark rq's current task 'to be rescheduled now'.
546 * 592 *
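For context, the lockless wake queue added above is built for a "queue wakeups under the lock, issue them after unlock" pattern, which is how the futex and SysV message-queue conversions elsewhere in this series use it. A minimal sketch, assuming the WAKE_Q() on-stack initializer this series adds to <linux/sched.h>, with a hypothetical spinlock-protected waiter list:

	#include <linux/sched.h>
	#include <linux/spinlock.h>
	#include <linux/list.h>

	struct my_waiter {				/* hypothetical waiter bookkeeping */
		struct list_head node;
		struct task_struct *task;
	};

	static void wake_all_waiters(spinlock_t *lock, struct list_head *waiters)
	{
		WAKE_Q(wake_q);				/* context-local head, no locking needed */
		struct my_waiter *w;

		spin_lock(lock);
		list_for_each_entry(w, waiters, node)
			wake_q_add(&wake_q, w->task);	/* takes a task reference */
		spin_unlock(lock);

		wake_up_q(&wake_q);			/* wakeups issued with the lock dropped */
	}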
@@ -2105,12 +2151,15 @@ void wake_up_new_task(struct task_struct *p)
2105 2151
2106#ifdef CONFIG_PREEMPT_NOTIFIERS 2152#ifdef CONFIG_PREEMPT_NOTIFIERS
2107 2153
2154static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
2155
2108/** 2156/**
2109 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2157 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2110 * @notifier: notifier struct to register 2158 * @notifier: notifier struct to register
2111 */ 2159 */
2112void preempt_notifier_register(struct preempt_notifier *notifier) 2160void preempt_notifier_register(struct preempt_notifier *notifier)
2113{ 2161{
2162 static_key_slow_inc(&preempt_notifier_key);
2114 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2163 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2115} 2164}
2116EXPORT_SYMBOL_GPL(preempt_notifier_register); 2165EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2119,15 +2168,16 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
2119 * preempt_notifier_unregister - no longer interested in preemption notifications 2168 * preempt_notifier_unregister - no longer interested in preemption notifications
2120 * @notifier: notifier struct to unregister 2169 * @notifier: notifier struct to unregister
2121 * 2170 *
2122 * This is safe to call from within a preemption notifier. 2171 * This is *not* safe to call from within a preemption notifier.
2123 */ 2172 */
2124void preempt_notifier_unregister(struct preempt_notifier *notifier) 2173void preempt_notifier_unregister(struct preempt_notifier *notifier)
2125{ 2174{
2126 hlist_del(&notifier->link); 2175 hlist_del(&notifier->link);
2176 static_key_slow_dec(&preempt_notifier_key);
2127} 2177}
2128EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2178EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2129 2179
2130static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2180static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2131{ 2181{
2132 struct preempt_notifier *notifier; 2182 struct preempt_notifier *notifier;
2133 2183
@@ -2135,9 +2185,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2135 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2185 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2136} 2186}
2137 2187
2188static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2189{
2190 if (static_key_false(&preempt_notifier_key))
2191 __fire_sched_in_preempt_notifiers(curr);
2192}
2193
2138static void 2194static void
2139fire_sched_out_preempt_notifiers(struct task_struct *curr, 2195__fire_sched_out_preempt_notifiers(struct task_struct *curr,
2140 struct task_struct *next) 2196 struct task_struct *next)
2141{ 2197{
2142 struct preempt_notifier *notifier; 2198 struct preempt_notifier *notifier;
2143 2199
@@ -2145,13 +2201,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2145 notifier->ops->sched_out(notifier, next); 2201 notifier->ops->sched_out(notifier, next);
2146} 2202}
2147 2203
2204static __always_inline void
2205fire_sched_out_preempt_notifiers(struct task_struct *curr,
2206 struct task_struct *next)
2207{
2208 if (static_key_false(&preempt_notifier_key))
2209 __fire_sched_out_preempt_notifiers(curr, next);
2210}
2211
2148#else /* !CONFIG_PREEMPT_NOTIFIERS */ 2212#else /* !CONFIG_PREEMPT_NOTIFIERS */
2149 2213
2150static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2214static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2151{ 2215{
2152} 2216}
2153 2217
2154static void 2218static inline void
2155fire_sched_out_preempt_notifiers(struct task_struct *curr, 2219fire_sched_out_preempt_notifiers(struct task_struct *curr,
2156 struct task_struct *next) 2220 struct task_struct *next)
2157{ 2221{
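The static key makes both hooks effectively free in the common case where no notifier is registered (KVM is the only in-tree user). For reference, a registration sketch against the existing preempt_ops interface, with hypothetical callbacks:

	#include <linux/preempt.h>
	#include <linux/sched.h>

	static void my_sched_in(struct preempt_notifier *pn, int cpu)
	{
		/* current was just scheduled back in on 'cpu' */
	}

	static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
	{
		/* current is being scheduled out in favour of 'next' */
	}

	static struct preempt_ops my_preempt_ops = {
		.sched_in	= my_sched_in,
		.sched_out	= my_sched_out,
	};

	static void my_attach(struct preempt_notifier *pn)
	{
		preempt_notifier_init(pn, &my_preempt_ops);
		preempt_notifier_register(pn);	/* now also bumps preempt_notifier_key */
	}

	static void my_detach(struct preempt_notifier *pn)
	{
		/* per the updated kerneldoc: not from within a notifier callback */
		preempt_notifier_unregister(pn);
	}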
@@ -2397,9 +2461,9 @@ unsigned long nr_iowait_cpu(int cpu)
2397 2461
2398void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) 2462void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2399{ 2463{
2400 struct rq *this = this_rq(); 2464 struct rq *rq = this_rq();
2401 *nr_waiters = atomic_read(&this->nr_iowait); 2465 *nr_waiters = atomic_read(&rq->nr_iowait);
2402 *load = this->cpu_load[0]; 2466 *load = rq->load.weight;
2403} 2467}
2404 2468
2405#ifdef CONFIG_SMP 2469#ifdef CONFIG_SMP
@@ -2497,6 +2561,7 @@ void scheduler_tick(void)
2497 update_rq_clock(rq); 2561 update_rq_clock(rq);
2498 curr->sched_class->task_tick(rq, curr, 0); 2562 curr->sched_class->task_tick(rq, curr, 0);
2499 update_cpu_load_active(rq); 2563 update_cpu_load_active(rq);
2564 calc_global_load_tick(rq);
2500 raw_spin_unlock(&rq->lock); 2565 raw_spin_unlock(&rq->lock);
2501 2566
2502 perf_event_task_tick(); 2567 perf_event_task_tick();
@@ -2525,7 +2590,7 @@ void scheduler_tick(void)
2525u64 scheduler_tick_max_deferment(void) 2590u64 scheduler_tick_max_deferment(void)
2526{ 2591{
2527 struct rq *rq = this_rq(); 2592 struct rq *rq = this_rq();
2528 unsigned long next, now = ACCESS_ONCE(jiffies); 2593 unsigned long next, now = READ_ONCE(jiffies);
2529 2594
2530 next = rq->last_sched_tick + HZ; 2595 next = rq->last_sched_tick + HZ;
2531 2596
@@ -2726,9 +2791,7 @@ again:
2726 * - return from syscall or exception to user-space 2791 * - return from syscall or exception to user-space
2727 * - return from interrupt-handler to user-space 2792 * - return from interrupt-handler to user-space
2728 * 2793 *
2729 * WARNING: all callers must re-check need_resched() afterward and reschedule 2794 * WARNING: must be called with preemption disabled!
2730 * accordingly in case an event triggered the need for rescheduling (such as
2731 * an interrupt waking up a task) while preemption was disabled in __schedule().
2732 */ 2795 */
2733static void __sched __schedule(void) 2796static void __sched __schedule(void)
2734{ 2797{
@@ -2737,7 +2800,6 @@ static void __sched __schedule(void)
2737 struct rq *rq; 2800 struct rq *rq;
2738 int cpu; 2801 int cpu;
2739 2802
2740 preempt_disable();
2741 cpu = smp_processor_id(); 2803 cpu = smp_processor_id();
2742 rq = cpu_rq(cpu); 2804 rq = cpu_rq(cpu);
2743 rcu_note_context_switch(); 2805 rcu_note_context_switch();
@@ -2801,8 +2863,6 @@ static void __sched __schedule(void)
2801 raw_spin_unlock_irq(&rq->lock); 2863 raw_spin_unlock_irq(&rq->lock);
2802 2864
2803 post_schedule(rq); 2865 post_schedule(rq);
2804
2805 sched_preempt_enable_no_resched();
2806} 2866}
2807 2867
2808static inline void sched_submit_work(struct task_struct *tsk) 2868static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +2883,9 @@ asmlinkage __visible void __sched schedule(void)
2823 2883
2824 sched_submit_work(tsk); 2884 sched_submit_work(tsk);
2825 do { 2885 do {
2886 preempt_disable();
2826 __schedule(); 2887 __schedule();
2888 sched_preempt_enable_no_resched();
2827 } while (need_resched()); 2889 } while (need_resched());
2828} 2890}
2829EXPORT_SYMBOL(schedule); 2891EXPORT_SYMBOL(schedule);
@@ -2862,15 +2924,14 @@ void __sched schedule_preempt_disabled(void)
2862static void __sched notrace preempt_schedule_common(void) 2924static void __sched notrace preempt_schedule_common(void)
2863{ 2925{
2864 do { 2926 do {
2865 __preempt_count_add(PREEMPT_ACTIVE); 2927 preempt_active_enter();
2866 __schedule(); 2928 __schedule();
2867 __preempt_count_sub(PREEMPT_ACTIVE); 2929 preempt_active_exit();
2868 2930
2869 /* 2931 /*
2870 * Check again in case we missed a preemption opportunity 2932 * Check again in case we missed a preemption opportunity
2871 * between schedule and now. 2933 * between schedule and now.
2872 */ 2934 */
2873 barrier();
2874 } while (need_resched()); 2935 } while (need_resched());
2875} 2936}
2876 2937
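preempt_active_enter()/preempt_active_exit() are helpers introduced elsewhere in this series in <linux/preempt.h>; judging by the call sites converted here, they bundle the PREEMPT_ACTIVE accounting, the compiler barrier and a preempt-disable offset into one step, roughly as follows (approximate expansion, stated as an assumption rather than the exact upstream definition):

	#define preempt_active_enter() \
	do { \
		preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
		barrier(); \
	} while (0)

	#define preempt_active_exit() \
	do { \
		barrier(); \
		preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
	} while (0)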
@@ -2894,9 +2955,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2894NOKPROBE_SYMBOL(preempt_schedule); 2955NOKPROBE_SYMBOL(preempt_schedule);
2895EXPORT_SYMBOL(preempt_schedule); 2956EXPORT_SYMBOL(preempt_schedule);
2896 2957
2897#ifdef CONFIG_CONTEXT_TRACKING
2898/** 2958/**
2899 * preempt_schedule_context - preempt_schedule called by tracing 2959 * preempt_schedule_notrace - preempt_schedule called by tracing
2900 * 2960 *
2901 * The tracing infrastructure uses preempt_enable_notrace to prevent 2961 * The tracing infrastructure uses preempt_enable_notrace to prevent
2902 * recursion and tracing preempt enabling caused by the tracing 2962 * recursion and tracing preempt enabling caused by the tracing
@@ -2909,7 +2969,7 @@ EXPORT_SYMBOL(preempt_schedule);
2909 * instead of preempt_schedule() to exit user context if needed before 2969 * instead of preempt_schedule() to exit user context if needed before
2910 * calling the scheduler. 2970 * calling the scheduler.
2911 */ 2971 */
2912asmlinkage __visible void __sched notrace preempt_schedule_context(void) 2972asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
2913{ 2973{
2914 enum ctx_state prev_ctx; 2974 enum ctx_state prev_ctx;
2915 2975
@@ -2917,7 +2977,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2917 return; 2977 return;
2918 2978
2919 do { 2979 do {
2920 __preempt_count_add(PREEMPT_ACTIVE); 2980 /*
2981 * Use raw __preempt_count_*() ops that don't call functions.
2982 * We can't call functions before disabling preemption, which is
2983 * what disarms preemption tracing recursion.
2984 */
2985 __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
2986 barrier();
2921 /* 2987 /*
2922 * Needs preempt disabled in case user_exit() is traced 2988 * Needs preempt disabled in case user_exit() is traced
2923 * and the tracer calls preempt_enable_notrace() causing 2989 * and the tracer calls preempt_enable_notrace() causing
@@ -2927,12 +2993,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2927 __schedule(); 2993 __schedule();
2928 exception_exit(prev_ctx); 2994 exception_exit(prev_ctx);
2929 2995
2930 __preempt_count_sub(PREEMPT_ACTIVE);
2931 barrier(); 2996 barrier();
2997 __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
2932 } while (need_resched()); 2998 } while (need_resched());
2933} 2999}
2934EXPORT_SYMBOL_GPL(preempt_schedule_context); 3000EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
2935#endif /* CONFIG_CONTEXT_TRACKING */
2936 3001
2937#endif /* CONFIG_PREEMPT */ 3002#endif /* CONFIG_PREEMPT */
2938 3003
@@ -2952,17 +3017,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
2952 prev_state = exception_enter(); 3017 prev_state = exception_enter();
2953 3018
2954 do { 3019 do {
2955 __preempt_count_add(PREEMPT_ACTIVE); 3020 preempt_active_enter();
2956 local_irq_enable(); 3021 local_irq_enable();
2957 __schedule(); 3022 __schedule();
2958 local_irq_disable(); 3023 local_irq_disable();
2959 __preempt_count_sub(PREEMPT_ACTIVE); 3024 preempt_active_exit();
2960
2961 /*
2962 * Check again in case we missed a preemption opportunity
2963 * between schedule and now.
2964 */
2965 barrier();
2966 } while (need_resched()); 3025 } while (need_resched());
2967 3026
2968 exception_exit(prev_state); 3027 exception_exit(prev_state);
@@ -3040,7 +3099,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3040 if (!dl_prio(p->normal_prio) || 3099 if (!dl_prio(p->normal_prio) ||
3041 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3100 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3042 p->dl.dl_boosted = 1; 3101 p->dl.dl_boosted = 1;
3043 p->dl.dl_throttled = 0;
3044 enqueue_flag = ENQUEUE_REPLENISH; 3102 enqueue_flag = ENQUEUE_REPLENISH;
3045 } else 3103 } else
3046 p->dl.dl_boosted = 0; 3104 p->dl.dl_boosted = 0;
@@ -5314,7 +5372,7 @@ static struct notifier_block migration_notifier = {
5314 .priority = CPU_PRI_MIGRATION, 5372 .priority = CPU_PRI_MIGRATION,
5315}; 5373};
5316 5374
5317static void __cpuinit set_cpu_rq_start_time(void) 5375static void set_cpu_rq_start_time(void)
5318{ 5376{
5319 int cpu = smp_processor_id(); 5377 int cpu = smp_processor_id();
5320 struct rq *rq = cpu_rq(cpu); 5378 struct rq *rq = cpu_rq(cpu);
@@ -7734,11 +7792,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
7734 return rt_runtime_us; 7792 return rt_runtime_us;
7735} 7793}
7736 7794
7737static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7795static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
7738{ 7796{
7739 u64 rt_runtime, rt_period; 7797 u64 rt_runtime, rt_period;
7740 7798
7741 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7799 rt_period = rt_period_us * NSEC_PER_USEC;
7742 rt_runtime = tg->rt_bandwidth.rt_runtime; 7800 rt_runtime = tg->rt_bandwidth.rt_runtime;
7743 7801
7744 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7802 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..f5a64ffad176 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
567{ 567{
568 cputime_t old; 568 cputime_t old;
569 569
570 while (new > (old = ACCESS_ONCE(*counter))) 570 while (new > (old = READ_ONCE(*counter)))
571 cmpxchg_cputime(counter, old, new); 571 cmpxchg_cputime(counter, old, new);
572} 572}
573 573
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145088fd..392e8fb94db3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -640,7 +640,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
640} 640}
641 641
642static 642static
643int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) 643int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
644{ 644{
645 return (dl_se->runtime <= 0); 645 return (dl_se->runtime <= 0);
646} 646}
@@ -684,7 +684,7 @@ static void update_curr_dl(struct rq *rq)
684 sched_rt_avg_update(rq, delta_exec); 684 sched_rt_avg_update(rq, delta_exec);
685 685
686 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; 686 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
687 if (dl_runtime_exceeded(rq, dl_se)) { 687 if (dl_runtime_exceeded(dl_se)) {
688 dl_se->dl_throttled = 1; 688 dl_se->dl_throttled = 1;
689 __dequeue_task_dl(rq, curr, 0); 689 __dequeue_task_dl(rq, curr, 0);
690 if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) 690 if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
995 rq = cpu_rq(cpu); 995 rq = cpu_rq(cpu);
996 996
997 rcu_read_lock(); 997 rcu_read_lock();
998 curr = ACCESS_ONCE(rq->curr); /* unlocked access */ 998 curr = READ_ONCE(rq->curr); /* unlocked access */
999 999
1000 /* 1000 /*
1001 * If we are dealing with a -deadline task, we must 1001 * If we are dealing with a -deadline task, we must
@@ -1012,7 +1012,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
1012 (p->nr_cpus_allowed > 1)) { 1012 (p->nr_cpus_allowed > 1)) {
1013 int target = find_later_rq(p); 1013 int target = find_later_rq(p);
1014 1014
1015 if (target != -1) 1015 if (target != -1 &&
1016 dl_time_before(p->dl.deadline,
1017 cpu_rq(target)->dl.earliest_dl.curr))
1016 cpu = target; 1018 cpu = target;
1017 } 1019 }
1018 rcu_read_unlock(); 1020 rcu_read_unlock();
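Concretely: if the waking deadline task is due at t+100ms but the CPU that find_later_rq() picked is already running a deadline task due at t+40ms, pushing the waking task there cannot let it run any earlier, so with this check the previous CPU is kept and the non-feasible target's rq lock is never taken.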
@@ -1230,6 +1232,32 @@ next_node:
1230 return NULL; 1232 return NULL;
1231} 1233}
1232 1234
1235/*
1236 * Return the earliest pushable rq's task, which is suitable to be executed
1237 * on the CPU, NULL otherwise:
1238 */
1239static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
1240{
1241 struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
1242 struct task_struct *p = NULL;
1243
1244 if (!has_pushable_dl_tasks(rq))
1245 return NULL;
1246
1247next_node:
1248 if (next_node) {
1249 p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
1250
1251 if (pick_dl_task(rq, p, cpu))
1252 return p;
1253
1254 next_node = rb_next(next_node);
1255 goto next_node;
1256 }
1257
1258 return NULL;
1259}
1260
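The goto above simply walks the pushable-tasks rbtree in deadline order until it finds a task that can run on the given CPU; a goto-free sketch of the same walk, shown only to spell out the control flow:

	static struct task_struct *
	pick_earliest_pushable_dl_task_loop(struct rq *rq, int cpu)
	{
		struct rb_node *node = rq->dl.pushable_dl_tasks_leftmost;
		struct task_struct *p;

		if (!has_pushable_dl_tasks(rq))
			return NULL;

		for (; node; node = rb_next(node)) {
			p = rb_entry(node, struct task_struct, pushable_dl_tasks);
			if (pick_dl_task(rq, p, cpu))
				return p;
		}

		return NULL;
	}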
1233static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); 1261static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
1234 1262
1235static int find_later_rq(struct task_struct *task) 1263static int find_later_rq(struct task_struct *task)
@@ -1333,6 +1361,17 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1333 1361
1334 later_rq = cpu_rq(cpu); 1362 later_rq = cpu_rq(cpu);
1335 1363
1364 if (!dl_time_before(task->dl.deadline,
1365 later_rq->dl.earliest_dl.curr)) {
1366 /*
1367 * Target rq has tasks of equal or earlier deadline,
1368 * retrying does not release any lock and is unlikely
1369 * to yield a different result.
1370 */
1371 later_rq = NULL;
1372 break;
1373 }
1374
1336 /* Retry if something changed. */ 1375 /* Retry if something changed. */
1337 if (double_lock_balance(rq, later_rq)) { 1376 if (double_lock_balance(rq, later_rq)) {
1338 if (unlikely(task_rq(task) != rq || 1377 if (unlikely(task_rq(task) != rq ||
@@ -1514,7 +1553,7 @@ static int pull_dl_task(struct rq *this_rq)
1514 if (src_rq->dl.dl_nr_running <= 1) 1553 if (src_rq->dl.dl_nr_running <= 1)
1515 goto skip; 1554 goto skip;
1516 1555
1517 p = pick_next_earliest_dl_task(src_rq, this_cpu); 1556 p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
1518 1557
1519 /* 1558 /*
1520 * We found a task to be pulled if: 1559 * We found a task to be pulled if:
@@ -1659,7 +1698,7 @@ static void rq_offline_dl(struct rq *rq)
1659 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); 1698 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
1660} 1699}
1661 1700
1662void init_sched_dl_class(void) 1701void __init init_sched_dl_class(void)
1663{ 1702{
1664 unsigned int i; 1703 unsigned int i;
1665 1704
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a245c1fc6f0a..704683cc9042 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -132,12 +132,14 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
132 p->prio); 132 p->prio);
133#ifdef CONFIG_SCHEDSTATS 133#ifdef CONFIG_SCHEDSTATS
134 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 134 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
135 SPLIT_NS(p->se.vruntime), 135 SPLIT_NS(p->se.statistics.wait_sum),
136 SPLIT_NS(p->se.sum_exec_runtime), 136 SPLIT_NS(p->se.sum_exec_runtime),
137 SPLIT_NS(p->se.statistics.sum_sleep_runtime)); 137 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
138#else 138#else
139 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 139 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
140 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 140 0LL, 0L,
141 SPLIT_NS(p->se.sum_exec_runtime),
142 0LL, 0L);
141#endif 143#endif
142#ifdef CONFIG_NUMA_BALANCING 144#ifdef CONFIG_NUMA_BALANCING
143 SEQ_printf(m, " %d", task_node(p)); 145 SEQ_printf(m, " %d", task_node(p));
@@ -156,7 +158,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
156 SEQ_printf(m, 158 SEQ_printf(m,
157 "\nrunnable tasks:\n" 159 "\nrunnable tasks:\n"
158 " task PID tree-key switches prio" 160 " task PID tree-key switches prio"
159 " exec-runtime sum-exec sum-sleep\n" 161 " wait-time sum-exec sum-sleep\n"
160 "------------------------------------------------------" 162 "------------------------------------------------------"
161 "----------------------------------------------------\n"); 163 "----------------------------------------------------\n");
162 164
@@ -582,6 +584,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
582 nr_switches = p->nvcsw + p->nivcsw; 584 nr_switches = p->nvcsw + p->nivcsw;
583 585
584#ifdef CONFIG_SCHEDSTATS 586#ifdef CONFIG_SCHEDSTATS
587 PN(se.statistics.sum_sleep_runtime);
585 PN(se.statistics.wait_start); 588 PN(se.statistics.wait_start);
586 PN(se.statistics.sleep_start); 589 PN(se.statistics.sleep_start);
587 PN(se.statistics.block_start); 590 PN(se.statistics.block_start);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2980e8733bc..433061d984ea 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
141 * 141 *
142 * This idea comes from the SD scheduler of Con Kolivas: 142 * This idea comes from the SD scheduler of Con Kolivas:
143 */ 143 */
144static int get_update_sysctl_factor(void) 144static unsigned int get_update_sysctl_factor(void)
145{ 145{
146 unsigned int cpus = min_t(int, num_online_cpus(), 8); 146 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
147 unsigned int factor; 147 unsigned int factor;
148 148
149 switch (sysctl_sched_tunable_scaling) { 149 switch (sysctl_sched_tunable_scaling) {
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
576 loff_t *ppos) 576 loff_t *ppos)
577{ 577{
578 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 578 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
579 int factor = get_update_sysctl_factor(); 579 unsigned int factor = get_update_sysctl_factor();
580 580
581 if (ret || !write) 581 if (ret || !write)
582 return ret; 582 return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
834 834
835static unsigned int task_scan_min(struct task_struct *p) 835static unsigned int task_scan_min(struct task_struct *p)
836{ 836{
837 unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); 837 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
838 unsigned int scan, floor; 838 unsigned int scan, floor;
839 unsigned int windows = 1; 839 unsigned int windows = 1;
840 840
@@ -1198,11 +1198,9 @@ static void task_numa_assign(struct task_numa_env *env,
1198static bool load_too_imbalanced(long src_load, long dst_load, 1198static bool load_too_imbalanced(long src_load, long dst_load,
1199 struct task_numa_env *env) 1199 struct task_numa_env *env)
1200{ 1200{
1201 long imb, old_imb;
1202 long orig_src_load, orig_dst_load;
1201 long src_capacity, dst_capacity; 1203 long src_capacity, dst_capacity;
1202 long orig_src_load;
1203 long load_a, load_b;
1204 long moved_load;
1205 long imb;
1206 1204
1207 /* 1205 /*
1208 * The load is corrected for the CPU capacity available on each node. 1206 * The load is corrected for the CPU capacity available on each node.
@@ -1215,39 +1213,30 @@ static bool load_too_imbalanced(long src_load, long dst_load,
1215 dst_capacity = env->dst_stats.compute_capacity; 1213 dst_capacity = env->dst_stats.compute_capacity;
1216 1214
1217 /* We care about the slope of the imbalance, not the direction. */ 1215 /* We care about the slope of the imbalance, not the direction. */
1218 load_a = dst_load; 1216 if (dst_load < src_load)
1219 load_b = src_load; 1217 swap(dst_load, src_load);
1220 if (load_a < load_b)
1221 swap(load_a, load_b);
1222 1218
1223 /* Is the difference below the threshold? */ 1219 /* Is the difference below the threshold? */
1224 imb = load_a * src_capacity * 100 - 1220 imb = dst_load * src_capacity * 100 -
1225 load_b * dst_capacity * env->imbalance_pct; 1221 src_load * dst_capacity * env->imbalance_pct;
1226 if (imb <= 0) 1222 if (imb <= 0)
1227 return false; 1223 return false;
1228 1224
1229 /* 1225 /*
1230 * The imbalance is above the allowed threshold. 1226 * The imbalance is above the allowed threshold.
1231 * Allow a move that brings us closer to a balanced situation, 1227 * Compare it with the old imbalance.
1232 * without moving things past the point of balance.
1233 */ 1228 */
1234 orig_src_load = env->src_stats.load; 1229 orig_src_load = env->src_stats.load;
1230 orig_dst_load = env->dst_stats.load;
1235 1231
1236 /* 1232 if (orig_dst_load < orig_src_load)
1237 * In a task swap, there will be one load moving from src to dst, 1233 swap(orig_dst_load, orig_src_load);
1238 * and another moving back. This is the net sum of both moves.
1239 * A simple task move will always have a positive value.
1240 * Allow the move if it brings the system closer to a balanced
1241 * situation, without crossing over the balance point.
1242 */
1243 moved_load = orig_src_load - src_load;
1244 1234
1245 if (moved_load > 0) 1235 old_imb = orig_dst_load * src_capacity * 100 -
1246 /* Moving src -> dst. Did we overshoot balance? */ 1236 orig_src_load * dst_capacity * env->imbalance_pct;
1247 return src_load * dst_capacity < dst_load * src_capacity; 1237
1248 else 1238 /* Would this change make things worse? */
1249 /* Moving dst -> src. Did we overshoot balance? */ 1239 return (imb > old_imb);
1250 return dst_load * src_capacity < src_load * dst_capacity;
1251} 1240}
1252 1241
1253/* 1242/*
@@ -1409,6 +1398,30 @@ static void task_numa_find_cpu(struct task_numa_env *env,
1409 } 1398 }
1410} 1399}
1411 1400
1401/* Only move tasks to a NUMA node less busy than the current node. */
1402static bool numa_has_capacity(struct task_numa_env *env)
1403{
1404 struct numa_stats *src = &env->src_stats;
1405 struct numa_stats *dst = &env->dst_stats;
1406
1407 if (src->has_free_capacity && !dst->has_free_capacity)
1408 return false;
1409
1410 /*
1411 * Only consider a task move if the source has a higher load
1412 * than the destination, corrected for CPU capacity on each node.
1413 *
1414 * src->load dst->load
1415 * --------------------- vs ---------------------
1416 * src->compute_capacity dst->compute_capacity
1417 */
1418 if (src->load * dst->compute_capacity >
1419 dst->load * src->compute_capacity)
1420 return true;
1421
1422 return false;
1423}
1424
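For example, with both nodes at compute_capacity 2048, a source load of 1500 against a destination load of 1000 gives 1500*2048 > 1000*2048, so the destination counts as less busy and task_numa_find_cpu() is still consulted; with the loads reversed, the search is skipped entirely.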
1412static int task_numa_migrate(struct task_struct *p) 1425static int task_numa_migrate(struct task_struct *p)
1413{ 1426{
1414 struct task_numa_env env = { 1427 struct task_numa_env env = {
@@ -1463,7 +1476,8 @@ static int task_numa_migrate(struct task_struct *p)
1463 update_numa_stats(&env.dst_stats, env.dst_nid); 1476 update_numa_stats(&env.dst_stats, env.dst_nid);
1464 1477
1465 /* Try to find a spot on the preferred nid. */ 1478 /* Try to find a spot on the preferred nid. */
1466 task_numa_find_cpu(&env, taskimp, groupimp); 1479 if (numa_has_capacity(&env))
1480 task_numa_find_cpu(&env, taskimp, groupimp);
1467 1481
1468 /* 1482 /*
1469 * Look at other nodes in these cases: 1483 * Look at other nodes in these cases:
@@ -1494,7 +1508,8 @@ static int task_numa_migrate(struct task_struct *p)
1494 env.dist = dist; 1508 env.dist = dist;
1495 env.dst_nid = nid; 1509 env.dst_nid = nid;
1496 update_numa_stats(&env.dst_stats, env.dst_nid); 1510 update_numa_stats(&env.dst_stats, env.dst_nid);
1497 task_numa_find_cpu(&env, taskimp, groupimp); 1511 if (numa_has_capacity(&env))
1512 task_numa_find_cpu(&env, taskimp, groupimp);
1498 } 1513 }
1499 } 1514 }
1500 1515
@@ -1794,7 +1809,12 @@ static void task_numa_placement(struct task_struct *p)
1794 u64 runtime, period; 1809 u64 runtime, period;
1795 spinlock_t *group_lock = NULL; 1810 spinlock_t *group_lock = NULL;
1796 1811
1797 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1812 /*
1813 * The p->mm->numa_scan_seq field gets updated without
1814 * exclusive access. Use READ_ONCE() here to ensure
1815 * that the field is read in a single access:
1816 */
1817 seq = READ_ONCE(p->mm->numa_scan_seq);
1798 if (p->numa_scan_seq == seq) 1818 if (p->numa_scan_seq == seq)
1799 return; 1819 return;
1800 p->numa_scan_seq = seq; 1820 p->numa_scan_seq = seq;
@@ -1938,7 +1958,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1938 } 1958 }
1939 1959
1940 rcu_read_lock(); 1960 rcu_read_lock();
1941 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); 1961 tsk = READ_ONCE(cpu_rq(cpu)->curr);
1942 1962
1943 if (!cpupid_match_pid(tsk, cpupid)) 1963 if (!cpupid_match_pid(tsk, cpupid))
1944 goto no_join; 1964 goto no_join;
@@ -2107,7 +2127,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2107 2127
2108static void reset_ptenuma_scan(struct task_struct *p) 2128static void reset_ptenuma_scan(struct task_struct *p)
2109{ 2129{
2110 ACCESS_ONCE(p->mm->numa_scan_seq)++; 2130 /*
2131 * We only did a read acquisition of the mmap sem, so
2132 * p->mm->numa_scan_seq is written to without exclusive access
2133 * and the update is not guaranteed to be atomic. That's not
2134 * much of an issue though, since this is just used for
2135 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2136 * expensive, to avoid any form of compiler optimizations:
2137 */
2138 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2111 p->mm->numa_scan_offset = 0; 2139 p->mm->numa_scan_offset = 0;
2112} 2140}
2113 2141
@@ -4323,6 +4351,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4323} 4351}
4324 4352
4325#ifdef CONFIG_SMP 4353#ifdef CONFIG_SMP
4354
4355/*
4356 * per-rq 'load' array crap; XXX kill this.
4357 */
4358
4359/*
4360 * The exact cpuload at various idx values, calculated at every tick would be
4361 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
4362 *
4363 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
4364 * on nth tick when cpu may be busy, then we have:
4365 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4366 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
4367 *
4368 * decay_load_missed() below does efficient calculation of
4369 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4370 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
4371 *
4372 * The calculation is approximated on a 128 point scale.
4373 * degrade_zero_ticks is the number of ticks after which load at any
4374 * particular idx is approximated to be zero.
4375 * degrade_factor is a precomputed table, a row for each load idx.
4376 * Each column corresponds to degradation factor for a power of two ticks,
4377 * based on 128 point scale.
4378 * Example:
4379 * row 2, col 3 (=12) says that the degradation at load idx 2 after
4380 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
4381 *
4382 * With this power of 2 load factors, we can degrade the load n times
4383 * by looking at 1 bits in n and doing as many mult/shift instead of
4384 * n mult/shifts needed by the exact degradation.
4385 */
4386#define DEGRADE_SHIFT 7
4387static const unsigned char
4388 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4389static const unsigned char
4390 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4391 {0, 0, 0, 0, 0, 0, 0, 0},
4392 {64, 32, 8, 0, 0, 0, 0, 0},
4393 {96, 72, 40, 12, 1, 0, 0},
4394 {112, 98, 75, 43, 15, 1, 0},
4395 {120, 112, 98, 76, 45, 16, 2} };
4396
4397/*
4398 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4399 * would be when CPU is idle and so we just decay the old load without
4400 * adding any new load.
4401 */
4402static unsigned long
4403decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4404{
4405 int j = 0;
4406
4407 if (!missed_updates)
4408 return load;
4409
4410 if (missed_updates >= degrade_zero_ticks[idx])
4411 return 0;
4412
4413 if (idx == 1)
4414 return load >> missed_updates;
4415
4416 while (missed_updates) {
4417 if (missed_updates % 2)
4418 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4419
4420 missed_updates >>= 1;
4421 j++;
4422 }
4423 return load;
4424}
4425
4426/*
4427 * Update rq->cpu_load[] statistics. This function is usually called every
4428 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
4429 * every tick. We fix it up based on jiffies.
4430 */
4431static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4432 unsigned long pending_updates)
4433{
4434 int i, scale;
4435
4436 this_rq->nr_load_updates++;
4437
4438 /* Update our load: */
4439 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4440 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4441 unsigned long old_load, new_load;
4442
4443 /* scale is effectively 1 << i now, and >> i divides by scale */
4444
4445 old_load = this_rq->cpu_load[i];
4446 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4447 new_load = this_load;
4448 /*
4449 * Round up the averaging division if load is increasing. This
4450 * prevents us from getting stuck on 9 if the load is 10, for
4451 * example.
4452 */
4453 if (new_load > old_load)
4454 new_load += scale - 1;
4455
4456 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4457 }
4458
4459 sched_avg_update(this_rq);
4460}
4461
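The round-up is what lets the average actually reach a higher load: at i = 1 (scale 2), old cpu_load[1] = 9 and this_load = 10 would otherwise stay at (9*1 + 10) >> 1 = 9 forever; with new_load bumped to 10 + (scale - 1) = 11, the result is (9 + 11) >> 1 = 10.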
4462#ifdef CONFIG_NO_HZ_COMMON
4463/*
4464 * There is no sane way to deal with nohz on smp when using jiffies because the
4465 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4466 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4467 *
4468 * Therefore we cannot use the delta approach from the regular tick since that
4469 * would seriously skew the load calculation. However we'll make do for those
4470 * updates happening while idle (nohz_idle_balance) or coming out of idle
4471 * (tick_nohz_idle_exit).
4472 *
4473 * This means we might still be one tick off for nohz periods.
4474 */
4475
4476/*
4477 * Called from nohz_idle_balance() to update the load ratings before doing the
4478 * idle balance.
4479 */
4480static void update_idle_cpu_load(struct rq *this_rq)
4481{
4482 unsigned long curr_jiffies = READ_ONCE(jiffies);
4483 unsigned long load = this_rq->cfs.runnable_load_avg;
4484 unsigned long pending_updates;
4485
4486 /*
4487 * bail if there's load or we're actually up-to-date.
4488 */
4489 if (load || curr_jiffies == this_rq->last_load_update_tick)
4490 return;
4491
4492 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4493 this_rq->last_load_update_tick = curr_jiffies;
4494
4495 __update_cpu_load(this_rq, load, pending_updates);
4496}
4497
4498/*
4499 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
4500 */
4501void update_cpu_load_nohz(void)
4502{
4503 struct rq *this_rq = this_rq();
4504 unsigned long curr_jiffies = READ_ONCE(jiffies);
4505 unsigned long pending_updates;
4506
4507 if (curr_jiffies == this_rq->last_load_update_tick)
4508 return;
4509
4510 raw_spin_lock(&this_rq->lock);
4511 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4512 if (pending_updates) {
4513 this_rq->last_load_update_tick = curr_jiffies;
4514 /*
4515 * We were idle, this means load 0, the current load might be
4516 * !0 due to remote wakeups and the sort.
4517 */
4518 __update_cpu_load(this_rq, 0, pending_updates);
4519 }
4520 raw_spin_unlock(&this_rq->lock);
4521}
4522#endif /* CONFIG_NO_HZ */
4523
4524/*
4525 * Called from scheduler_tick()
4526 */
4527void update_cpu_load_active(struct rq *this_rq)
4528{
4529 unsigned long load = this_rq->cfs.runnable_load_avg;
4530 /*
4531 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4532 */
4533 this_rq->last_load_update_tick = jiffies;
4534 __update_cpu_load(this_rq, load, 1);
4535}
4536
4326/* Used instead of source_load when we know the type == 0 */ 4537/* Used instead of source_load when we know the type == 0 */
4327static unsigned long weighted_cpuload(const int cpu) 4538static unsigned long weighted_cpuload(const int cpu)
4328{ 4539{
@@ -4375,7 +4586,7 @@ static unsigned long capacity_orig_of(int cpu)
4375static unsigned long cpu_avg_load_per_task(int cpu) 4586static unsigned long cpu_avg_load_per_task(int cpu)
4376{ 4587{
4377 struct rq *rq = cpu_rq(cpu); 4588 struct rq *rq = cpu_rq(cpu);
4378 unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); 4589 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4379 unsigned long load_avg = rq->cfs.runnable_load_avg; 4590 unsigned long load_avg = rq->cfs.runnable_load_avg;
4380 4591
4381 if (nr_running) 4592 if (nr_running)
@@ -5126,18 +5337,21 @@ again:
5126 * entity, update_curr() will update its vruntime, otherwise 5337 * entity, update_curr() will update its vruntime, otherwise
5127 * forget we've ever seen it. 5338 * forget we've ever seen it.
5128 */ 5339 */
5129 if (curr && curr->on_rq) 5340 if (curr) {
5130 update_curr(cfs_rq); 5341 if (curr->on_rq)
5131 else 5342 update_curr(cfs_rq);
5132 curr = NULL; 5343 else
5344 curr = NULL;
5133 5345
5134 /* 5346 /*
5135 * This call to check_cfs_rq_runtime() will do the throttle and 5347 * This call to check_cfs_rq_runtime() will do the
5136 * dequeue its entity in the parent(s). Therefore the 'simple' 5348 * throttle and dequeue its entity in the parent(s).
5137 * nr_running test will indeed be correct. 5349 * Therefore the 'simple' nr_running test will indeed
5138 */ 5350 * be correct.
5139 if (unlikely(check_cfs_rq_runtime(cfs_rq))) 5351 */
5140 goto simple; 5352 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5353 goto simple;
5354 }
5141 5355
5142 se = pick_next_entity(cfs_rq, curr); 5356 se = pick_next_entity(cfs_rq, curr);
5143 cfs_rq = group_cfs_rq(se); 5357 cfs_rq = group_cfs_rq(se);
@@ -5467,10 +5681,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
5467} 5681}
5468 5682
5469#ifdef CONFIG_NUMA_BALANCING 5683#ifdef CONFIG_NUMA_BALANCING
5470/* Returns true if the destination node has incurred more faults */ 5684/*
5685 * Returns true if the destination node is the preferred node.
5686 * Needs to match fbq_classify_rq(): if there is a runnable task
5687 * that is not on its preferred node, we should identify it.
5688 */
5471static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5689static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5472{ 5690{
5473 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5691 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5692 unsigned long src_faults, dst_faults;
5474 int src_nid, dst_nid; 5693 int src_nid, dst_nid;
5475 5694
5476 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5695 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5484,29 +5703,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5484 if (src_nid == dst_nid) 5703 if (src_nid == dst_nid)
5485 return false; 5704 return false;
5486 5705
5487 if (numa_group) {
5488 /* Task is already in the group's interleave set. */
5489 if (node_isset(src_nid, numa_group->active_nodes))
5490 return false;
5491
5492 /* Task is moving into the group's interleave set. */
5493 if (node_isset(dst_nid, numa_group->active_nodes))
5494 return true;
5495
5496 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5497 }
5498
5499 /* Encourage migration to the preferred node. */ 5706 /* Encourage migration to the preferred node. */
5500 if (dst_nid == p->numa_preferred_nid) 5707 if (dst_nid == p->numa_preferred_nid)
5501 return true; 5708 return true;
5502 5709
5503 return task_faults(p, dst_nid) > task_faults(p, src_nid); 5710 /* Migrating away from the preferred node is bad. */
5711 if (src_nid == p->numa_preferred_nid)
5712 return false;
5713
5714 if (numa_group) {
5715 src_faults = group_faults(p, src_nid);
5716 dst_faults = group_faults(p, dst_nid);
5717 } else {
5718 src_faults = task_faults(p, src_nid);
5719 dst_faults = task_faults(p, dst_nid);
5720 }
5721
5722 return dst_faults > src_faults;
5504} 5723}
5505 5724
5506 5725
5507static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5726static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5508{ 5727{
5509 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5728 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5729 unsigned long src_faults, dst_faults;
5510 int src_nid, dst_nid; 5730 int src_nid, dst_nid;
5511 5731
5512 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5732 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5521,23 +5741,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5521 if (src_nid == dst_nid) 5741 if (src_nid == dst_nid)
5522 return false; 5742 return false;
5523 5743
5524 if (numa_group) { 5744 /* Migrating away from the preferred node is bad. */
5525 /* Task is moving within/into the group's interleave set. */ 5745 if (src_nid == p->numa_preferred_nid)
5526 if (node_isset(dst_nid, numa_group->active_nodes)) 5746 return true;
5527 return false;
5528 5747
5529 /* Task is moving out of the group's interleave set. */ 5748 /* Encourage migration to the preferred node. */
5530 if (node_isset(src_nid, numa_group->active_nodes)) 5749 if (dst_nid == p->numa_preferred_nid)
5531 return true; 5750 return false;
5532 5751
5533 return group_faults(p, dst_nid) < group_faults(p, src_nid); 5752 if (numa_group) {
5753 src_faults = group_faults(p, src_nid);
5754 dst_faults = group_faults(p, dst_nid);
5755 } else {
5756 src_faults = task_faults(p, src_nid);
5757 dst_faults = task_faults(p, dst_nid);
5534 } 5758 }
5535 5759
5536 /* Migrating away from the preferred node is always bad. */ 5760 return dst_faults < src_faults;
5537 if (src_nid == p->numa_preferred_nid)
5538 return true;
5539
5540 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5541} 5761}
5542 5762
5543#else 5763#else
@@ -6037,8 +6257,8 @@ static unsigned long scale_rt_capacity(int cpu)
6037 * Since we're reading these variables without serialization make sure 6257 * Since we're reading these variables without serialization make sure
6038 * we read them once before doing sanity checks on them. 6258 * we read them once before doing sanity checks on them.
6039 */ 6259 */
6040 age_stamp = ACCESS_ONCE(rq->age_stamp); 6260 age_stamp = READ_ONCE(rq->age_stamp);
6041 avg = ACCESS_ONCE(rq->rt_avg); 6261 avg = READ_ONCE(rq->rt_avg);
6042 delta = __rq_clock_broken(rq) - age_stamp; 6262 delta = __rq_clock_broken(rq) - age_stamp;
6043 6263
6044 if (unlikely(delta < 0)) 6264 if (unlikely(delta < 0))
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 8ecd552fe4f2..ef7159012cf3 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
1/* 1/*
2 * kernel/sched/proc.c 2 * kernel/sched/loadavg.c
3 * 3 *
4 * Kernel load calculations, forked from sched/core.c 4 * This file contains the magic bits required to compute the global loadavg
5 * figure. It's a silly number but people think it's important. We go through
6 * great pains to make it work on big machines and tickless kernels.
5 */ 7 */
6 8
7#include <linux/export.h> 9#include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
81 long nr_active, delta = 0; 83 long nr_active, delta = 0;
82 84
83 nr_active = this_rq->nr_running; 85 nr_active = this_rq->nr_running;
84 nr_active += (long) this_rq->nr_uninterruptible; 86 nr_active += (long)this_rq->nr_uninterruptible;
85 87
86 if (nr_active != this_rq->calc_load_active) { 88 if (nr_active != this_rq->calc_load_active) {
87 delta = nr_active - this_rq->calc_load_active; 89 delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
186 delta = calc_load_fold_active(this_rq); 188 delta = calc_load_fold_active(this_rq);
187 if (delta) { 189 if (delta) {
188 int idx = calc_load_write_idx(); 190 int idx = calc_load_write_idx();
191
189 atomic_long_add(delta, &calc_load_idle[idx]); 192 atomic_long_add(delta, &calc_load_idle[idx]);
190 } 193 }
191} 194}
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
241{ 244{
242 unsigned long result = 1UL << frac_bits; 245 unsigned long result = 1UL << frac_bits;
243 246
244 if (n) for (;;) { 247 if (n) {
245 if (n & 1) { 248 for (;;) {
246 result *= x; 249 if (n & 1) {
247 result += 1UL << (frac_bits - 1); 250 result *= x;
248 result >>= frac_bits; 251 result += 1UL << (frac_bits - 1);
252 result >>= frac_bits;
253 }
254 n >>= 1;
255 if (!n)
256 break;
257 x *= x;
258 x += 1UL << (frac_bits - 1);
259 x >>= frac_bits;
249 } 260 }
250 n >>= 1;
251 if (!n)
252 break;
253 x *= x;
254 x += 1UL << (frac_bits - 1);
255 x >>= frac_bits;
256 } 261 }
257 262
258 return result; 263 return result;
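fixed_power_int() is exponentiation by squaring in FSHIFT (11-bit) fixed point, which calc_load_n() uses to fold several missed LOAD_FREQ periods into a single update. For example, two missed periods of the 1-minute average: fixed_power_int(EXP_1, FSHIFT, 2) evaluates to 1733, i.e. (1884*1884 + 1024) >> 11 with rounding, matching 2048 * e^(-10s/60s) ≈ 1734 to within one count.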
@@ -285,7 +290,6 @@ static unsigned long
285calc_load_n(unsigned long load, unsigned long exp, 290calc_load_n(unsigned long load, unsigned long exp,
286 unsigned long active, unsigned int n) 291 unsigned long active, unsigned int n)
287{ 292{
288
289 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 293 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
290} 294}
291 295
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
339/* 343/*
340 * calc_load - update the avenrun load estimates 10 ticks after the 344 * calc_load - update the avenrun load estimates 10 ticks after the
341 * CPUs have updated calc_load_tasks. 345 * CPUs have updated calc_load_tasks.
346 *
347 * Called from the global timer code.
342 */ 348 */
343void calc_global_load(unsigned long ticks) 349void calc_global_load(unsigned long ticks)
344{ 350{
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
370} 376}
371 377
372/* 378/*
373 * Called from update_cpu_load() to periodically update this CPU's 379 * Called from scheduler_tick() to periodically update this CPU's
374 * active count. 380 * active count.
375 */ 381 */
376static void calc_load_account_active(struct rq *this_rq) 382void calc_global_load_tick(struct rq *this_rq)
377{ 383{
378 long delta; 384 long delta;
379 385
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
386 392
387 this_rq->calc_load_update += LOAD_FREQ; 393 this_rq->calc_load_update += LOAD_FREQ;
388} 394}
389
390/*
391 * End of global load-average stuff
392 */
393
394/*
395 * The exact cpuload at various idx values, calculated at every tick would be
396 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
397 *
398 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
399 * on nth tick when cpu may be busy, then we have:
400 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
401 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
402 *
403 * decay_load_missed() below does efficient calculation of
404 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
405 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
406 *
407 * The calculation is approximated on a 128 point scale.
408 * degrade_zero_ticks is the number of ticks after which load at any
409 * particular idx is approximated to be zero.
410 * degrade_factor is a precomputed table, a row for each load idx.
411 * Each column corresponds to degradation factor for a power of two ticks,
412 * based on 128 point scale.
413 * Example:
414 * row 2, col 3 (=12) says that the degradation at load idx 2 after
415 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
416 *
417 * With this power of 2 load factors, we can degrade the load n times
418 * by looking at 1 bits in n and doing as many mult/shift instead of
419 * n mult/shifts needed by the exact degradation.
420 */
421#define DEGRADE_SHIFT 7
422static const unsigned char
423 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
424static const unsigned char
425 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
426 {0, 0, 0, 0, 0, 0, 0, 0},
427 {64, 32, 8, 0, 0, 0, 0, 0},
428 {96, 72, 40, 12, 1, 0, 0},
429 {112, 98, 75, 43, 15, 1, 0},
430 {120, 112, 98, 76, 45, 16, 2} };
431
432/*
433 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
434 * would be when CPU is idle and so we just decay the old load without
435 * adding any new load.
436 */
437static unsigned long
438decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
439{
440 int j = 0;
441
442 if (!missed_updates)
443 return load;
444
445 if (missed_updates >= degrade_zero_ticks[idx])
446 return 0;
447
448 if (idx == 1)
449 return load >> missed_updates;
450
451 while (missed_updates) {
452 if (missed_updates % 2)
453 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
454
455 missed_updates >>= 1;
456 j++;
457 }
458 return load;
459}
460
461/*
462 * Update rq->cpu_load[] statistics. This function is usually called every
463 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
464 * every tick. We fix it up based on jiffies.
465 */
466static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
467 unsigned long pending_updates)
468{
469 int i, scale;
470
471 this_rq->nr_load_updates++;
472
473 /* Update our load: */
474 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
475 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
476 unsigned long old_load, new_load;
477
478 /* scale is effectively 1 << i now, and >> i divides by scale */
479
480 old_load = this_rq->cpu_load[i];
481 old_load = decay_load_missed(old_load, pending_updates - 1, i);
482 new_load = this_load;
483 /*
484 * Round up the averaging division if load is increasing. This
485 * prevents us from getting stuck on 9 if the load is 10, for
486 * example.
487 */
488 if (new_load > old_load)
489 new_load += scale - 1;
490
491 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
492 }
493
494 sched_avg_update(this_rq);
495}
496
497#ifdef CONFIG_SMP
498static inline unsigned long get_rq_runnable_load(struct rq *rq)
499{
500 return rq->cfs.runnable_load_avg;
501}
502#else
503static inline unsigned long get_rq_runnable_load(struct rq *rq)
504{
505 return rq->load.weight;
506}
507#endif
508
509#ifdef CONFIG_NO_HZ_COMMON
510/*
511 * There is no sane way to deal with nohz on smp when using jiffies because the
512 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
513 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
514 *
515 * Therefore we cannot use the delta approach from the regular tick since that
516 * would seriously skew the load calculation. However we'll make do for those
517 * updates happening while idle (nohz_idle_balance) or coming out of idle
518 * (tick_nohz_idle_exit).
519 *
520 * This means we might still be one tick off for nohz periods.
521 */
522
523/*
524 * Called from nohz_idle_balance() to update the load ratings before doing the
525 * idle balance.
526 */
527void update_idle_cpu_load(struct rq *this_rq)
528{
529 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
530 unsigned long load = get_rq_runnable_load(this_rq);
531 unsigned long pending_updates;
532
533 /*
534 * bail if there's load or we're actually up-to-date.
535 */
536 if (load || curr_jiffies == this_rq->last_load_update_tick)
537 return;
538
539 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
540 this_rq->last_load_update_tick = curr_jiffies;
541
542 __update_cpu_load(this_rq, load, pending_updates);
543}
544
545/*
546 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
547 */
548void update_cpu_load_nohz(void)
549{
550 struct rq *this_rq = this_rq();
551 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
552 unsigned long pending_updates;
553
554 if (curr_jiffies == this_rq->last_load_update_tick)
555 return;
556
557 raw_spin_lock(&this_rq->lock);
558 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
559 if (pending_updates) {
560 this_rq->last_load_update_tick = curr_jiffies;
561 /*
562 * We were idle, this means load 0, the current load might be
563 * !0 due to remote wakeups and the sort.
564 */
565 __update_cpu_load(this_rq, 0, pending_updates);
566 }
567 raw_spin_unlock(&this_rq->lock);
568}
569#endif /* CONFIG_NO_HZ */
570
571/*
572 * Called from scheduler_tick()
573 */
574void update_cpu_load_active(struct rq *this_rq)
575{
576 unsigned long load = get_rq_runnable_load(this_rq);
577 /*
578 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
579 */
580 this_rq->last_load_update_tick = jiffies;
581 __update_cpu_load(this_rq, load, 1);
582
583 calc_load_account_active(this_rq);
584}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3874..560d2fa623c3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1323 rq = cpu_rq(cpu); 1323 rq = cpu_rq(cpu);
1324 1324
1325 rcu_read_lock(); 1325 rcu_read_lock();
1326 curr = ACCESS_ONCE(rq->curr); /* unlocked access */ 1326 curr = READ_ONCE(rq->curr); /* unlocked access */
1327 1327
1328 /* 1328 /*
1329 * If the current task on @p's runqueue is an RT task, then 1329 * If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e129993958..d62b2882232b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
26extern unsigned long calc_load_update; 26extern unsigned long calc_load_update;
27extern atomic_long_t calc_load_tasks; 27extern atomic_long_t calc_load_tasks;
28 28
29extern void calc_global_load_tick(struct rq *this_rq);
29extern long calc_load_fold_active(struct rq *this_rq); 30extern long calc_load_fold_active(struct rq *this_rq);
31
32#ifdef CONFIG_SMP
30extern void update_cpu_load_active(struct rq *this_rq); 33extern void update_cpu_load_active(struct rq *this_rq);
34#else
35static inline void update_cpu_load_active(struct rq *this_rq) { }
36#endif
31 37
32/* 38/*
33 * Helpers for converting nanosecond timing to jiffy resolution 39 * Helpers for converting nanosecond timing to jiffy resolution
@@ -707,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
707 713
708static inline u64 __rq_clock_broken(struct rq *rq) 714static inline u64 __rq_clock_broken(struct rq *rq)
709{ 715{
710 return ACCESS_ONCE(rq->clock); 716 return READ_ONCE(rq->clock);
711} 717}
712 718
713static inline u64 rq_clock(struct rq *rq) 719static inline u64 rq_clock(struct rq *rq)
@@ -1284,7 +1290,6 @@ extern void update_max_interval(void);
1284extern void init_sched_dl_class(void); 1290extern void init_sched_dl_class(void);
1285extern void init_sched_rt_class(void); 1291extern void init_sched_rt_class(void);
1286extern void init_sched_fair_class(void); 1292extern void init_sched_fair_class(void);
1287extern void init_sched_dl_class(void);
1288 1293
1289extern void resched_curr(struct rq *rq); 1294extern void resched_curr(struct rq *rq);
1290extern void resched_cpu(int cpu); 1295extern void resched_cpu(int cpu);
@@ -1298,8 +1303,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1298 1303
1299unsigned long to_ratio(u64 period, u64 runtime); 1304unsigned long to_ratio(u64 period, u64 runtime);
1300 1305
1301extern void update_idle_cpu_load(struct rq *this_rq);
1302
1303extern void init_task_runnable_average(struct task_struct *p); 1306extern void init_task_runnable_average(struct task_struct *p);
1304 1307
1305static inline void add_nr_running(struct rq *rq, unsigned count) 1308static inline void add_nr_running(struct rq *rq, unsigned count)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339656..077ebbd5e10f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
174{ 174{
175 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 175 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
176 176
177 if (!cputimer->running) 177 /* Check if cputimer isn't running. This is accessed without locking. */
178 if (!READ_ONCE(cputimer->running))
178 return false; 179 return false;
179 180
180 /* 181 /*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
215 if (!cputimer_running(tsk)) 216 if (!cputimer_running(tsk))
216 return; 217 return;
217 218
218 raw_spin_lock(&cputimer->lock); 219 atomic64_add(cputime, &cputimer->cputime_atomic.utime);
219 cputimer->cputime.utime += cputime;
220 raw_spin_unlock(&cputimer->lock);
221} 220}
222 221
223/** 222/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
238 if (!cputimer_running(tsk)) 237 if (!cputimer_running(tsk))
239 return; 238 return;
240 239
241 raw_spin_lock(&cputimer->lock); 240 atomic64_add(cputime, &cputimer->cputime_atomic.stime);
242 cputimer->cputime.stime += cputime;
243 raw_spin_unlock(&cputimer->lock);
244} 241}
245 242
246/** 243/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.sum_exec_runtime += ns;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
 }
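
[Editor's illustration, not part of the patch] The three accounting helpers above replace a lock/add/unlock sequence with a single atomic 64-bit addition. The pattern translates directly to portable code; a minimal userspace sketch with C11 atomics (struct and function names here are made up for illustration):

    /* Hedged sketch: lockless accumulation with C11 atomics, analogous to
     * atomic64_add() on cputimer->cputime_atomic.*; not kernel code. */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct group_cputime {                /* hypothetical stand-in for task_cputime_atomic */
        _Atomic uint64_t utime;
        _Atomic uint64_t stime;
    };

    static void account_user_time(struct group_cputime *t, uint64_t delta)
    {
        /* One atomic read-modify-write replaces lock + add + unlock. */
        atomic_fetch_add_explicit(&t->utime, delta, memory_order_relaxed);
    }

    int main(void)
    {
        struct group_cputime t = { 0, 0 };
        account_user_time(&t, 1000);
        printf("utime=%llu\n", (unsigned long long)atomic_load(&t.utime));
        return 0;
    }

With no shared lock to contend on, concurrent threads accounting their own time no longer serialize on the timer structure, which is the scalability point of this change.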
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9bc82329eaad..052e02672d12 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
 
 __sched int bit_wait_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
 
 __sched int bit_wait_io_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
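
[Editor's illustration, not part of the patch] These hunks are part of a tree-wide move from ACCESS_ONCE() to READ_ONCE(). For scalar types both boil down to a volatile access that stops the compiler from caching, refetching, or tearing the load; a rough userspace approximation (not the kernel's actual macro, which also handles aggregate types) looks like:

    /* Rough approximation of a READ_ONCE()-style volatile load for scalars. */
    #include <stdio.h>

    #define read_once(x) (*(const volatile __typeof__(x) *)&(x))

    static unsigned long jiffies_like;   /* hypothetical shared counter */

    int main(void)
    {
        /* Exactly one load is emitted here; the compiler may not reuse a stale copy. */
        unsigned long now = read_once(jiffies_like);
        printf("%lu\n", now);
        return 0;
    }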
diff --git a/kernel/signal.c b/kernel/signal.c
index d51c5ddd855c..f19833b5db3c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
  * RETURNS:
  * %true if @mask is set, %false if made noop because @task was dying.
  */
-bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
 {
 	BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
 			JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
 {
 	BUG_ON(mask & ~JOBCTL_PENDING_MASK);
 
@@ -2000,7 +2000,7 @@ static bool do_signal_stop(int signr)
 	struct signal_struct *sig = current->signal;
 
 	if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
-		unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+		unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
 		struct task_struct *t;
 
 		/* signr will be recorded in task->jobctl for retries */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 695f0c6cd169..fd643d8c4b42 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -211,25 +211,6 @@ static int multi_cpu_stop(void *data)
 	return err;
 }
 
-struct irq_cpu_stop_queue_work_info {
-	int cpu1;
-	int cpu2;
-	struct cpu_stop_work *work1;
-	struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
-	struct irq_cpu_stop_queue_work_info *info = arg;
-	cpu_stop_queue_work(info->cpu1, info->work1);
-	cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
 /**
  * stop_two_cpus - stops two cpus
  * @cpu1: the cpu to stop
@@ -245,7 +226,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 {
 	struct cpu_stop_done done;
 	struct cpu_stop_work work1, work2;
-	struct irq_cpu_stop_queue_work_info call_args;
 	struct multi_stop_data msdata;
 
 	preempt_disable();
@@ -262,13 +242,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		.done = &done
 	};
 
-	call_args = (struct irq_cpu_stop_queue_work_info){
-		.cpu1 = cpu1,
-		.cpu2 = cpu2,
-		.work1 = &work1,
-		.work2 = &work2,
-	};
-
 	cpu_stop_init_done(&done, 2);
 	set_state(&msdata, MULTI_STOP_PREPARE);
 
@@ -285,16 +258,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		return -ENOENT;
 	}
 
-	lg_local_lock(&stop_cpus_lock);
-	/*
-	 * Queuing needs to be done by the lowest numbered CPU, to ensure
-	 * that works are always queued in the same order on every CPU.
-	 * This prevents deadlocks.
-	 */
-	smp_call_function_single(min(cpu1, cpu2),
-				 &irq_cpu_stop_queue_work,
-				 &call_args, 1);
-	lg_local_unlock(&stop_cpus_lock);
+	lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+	cpu_stop_queue_work(cpu1, &work1);
+	cpu_stop_queue_work(cpu2, &work2);
+	lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
 	preempt_enable();
 
 	wait_for_completion(&done.completion);
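
[Editor's illustration, not part of the patch] The rewritten stop_two_cpus() takes both per-CPU stopper locks via lg_double_lock(), which acquires them in a fixed order instead of funnelling the queueing through the lowest-numbered CPU with an IPI. The ordering idea itself is easy to sketch in userspace C with pthreads (cpu_lock[], double_lock() and double_unlock() are invented names):

    /* Hedged sketch of ordered double locking; not the kernel implementation. */
    #include <pthread.h>
    #include <stdio.h>

    #define NR_CPUS 4
    static pthread_mutex_t cpu_lock[NR_CPUS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    };

    static void double_lock(int cpu1, int cpu2)
    {
        int lo = cpu1 < cpu2 ? cpu1 : cpu2;
        int hi = cpu1 < cpu2 ? cpu2 : cpu1;

        /* Always take the lower index first: two callers can never hold
         * the pair in opposite orders, so no ABBA deadlock is possible. */
        pthread_mutex_lock(&cpu_lock[lo]);
        pthread_mutex_lock(&cpu_lock[hi]);
    }

    static void double_unlock(int cpu1, int cpu2)
    {
        pthread_mutex_unlock(&cpu_lock[cpu1]);
        pthread_mutex_unlock(&cpu_lock[cpu2]);
    }

    int main(void)
    {
        double_lock(2, 1);
        printf("queue work on both CPUs here\n");
        double_unlock(2, 1);
        return 0;
    }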
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da74abf0..892e3dae0aac 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
 {
-	if (b->utime > a->utime)
-		a->utime = b->utime;
+	u64 curr_cputime;
+retry:
+	curr_cputime = atomic64_read(cputime);
+	if (sum_cputime > curr_cputime) {
+		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+			goto retry;
+	}
+}
 
-	if (b->stime > a->stime)
-		a->stime = b->stime;
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+{
+	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
+	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
+	__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
+}
 
-	if (b->sum_exec_runtime > a->sum_exec_runtime)
-		a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+					 struct task_cputime_atomic *atomic_times)
+{
+	times->utime = atomic64_read(&atomic_times->utime);
+	times->stime = atomic64_read(&atomic_times->stime);
+	times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
 }
 
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	struct task_cputime sum;
-	unsigned long flags;
 
-	if (!cputimer->running) {
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running)) {
 		/*
 		 * The POSIX timer interface allows for absolute time expiry
 		 * values through the TIMER_ABSTIME flag, therefore we have
-		 * to synchronize the timer to the clock every time we start
-		 * it.
+		 * to synchronize the timer to the clock every time we start it.
 		 */
 		thread_group_cputime(tsk, &sum);
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-		cputimer->running = 1;
-		update_gt_cputime(&cputimer->cputime, &sum);
-	} else
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-	*times = cputimer->cputime;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+		update_gt_cputime(&cputimer->cputime_atomic, &sum);
+
+		/*
+		 * We're setting cputimer->running without a lock. Ensure
+		 * this only gets written to in one operation. We set
+		 * running after update_gt_cputime() as a small optimization,
+		 * but barriers are not required because update_gt_cputime()
+		 * can handle concurrent updates.
+		 */
+		WRITE_ONCE(cputimer->running, 1);
+	}
+	sample_cputime_atomic(times, &cputimer->cputime_atomic);
 }
 
 /*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
 	if (!task_cputime_zero(&tsk->cputime_expires))
 		return false;
 
-	if (tsk->signal->cputimer.running)
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		return false;
 
 	return true;
@@ -852,10 +876,10 @@ static void check_thread_timers(struct task_struct *tsk,
 	/*
 	 * Check for the special case thread timers.
 	 */
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
 		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }
 
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
 {
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 0;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+	/* Turn off cputimer->running. This is done without locking. */
+	WRITE_ONCE(cputimer->running, 0);
 }
 
 static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
 			 SIGPROF);
 	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
 			 SIGVTALRM);
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long psecs = cputime_to_secs(ptime);
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
 		cputime_t x;
 		if (psecs >= hard) {
 			/*
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	}
 
 	sig = tsk->signal;
-	if (sig->cputimer.running) {
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(sig->cputimer.running)) {
 		struct task_cputime group_sample;
 
-		raw_spin_lock(&sig->cputimer.lock);
-		group_sample = sig->cputimer.cputime;
-		raw_spin_unlock(&sig->cputimer.lock);
+		sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * If there are any active process wide timers (POSIX 1.b, itimers,
 	 * RLIMIT_CPU) cputimer must be running.
 	 */
-	if (tsk->signal->cputimer.running)
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		check_process_timers(tsk, &firing);
 
 	/*
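
[Editor's illustration, not part of the patch] The new __update_gt_cputime() above is a classic lock-free "only move the value forward" loop: read, compare, compare-and-swap, retry on failure. The same retry shape in portable C11 (update_max() is a made-up name) looks like:

    /* Hedged sketch of a cmpxchg retry loop that only ever raises a value,
     * mirroring __update_gt_cputime() in spirit; userspace C11, not kernel code. */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static void update_max(_Atomic uint64_t *val, uint64_t new_val)
    {
        uint64_t cur = atomic_load(val);

        /* Retry until we either store new_val or observe a value >= new_val. */
        while (new_val > cur) {
            if (atomic_compare_exchange_weak(val, &cur, new_val))
                break;
            /* On failure, cur has been reloaded with the current value;
             * the loop condition re-checks whether an update is still needed. */
        }
    }

    int main(void)
    {
        _Atomic uint64_t v = 100;
        update_max(&v, 250);   /* raises v to 250 */
        update_max(&v, 200);   /* no effect, 250 is already larger */
        printf("%llu\n", (unsigned long long)atomic_load(&v));
        return 0;
    }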
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
index 4f134d8907a7..f610b2a10b3e 100644
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -191,7 +191,7 @@ int cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
 	/* Update distances based on topology */
 	for_each_cpu(cpu, update_mask) {
 		if (cpu_rmap_copy_neigh(rmap, cpu,
-					topology_thread_cpumask(cpu), 1))
+					topology_sibling_cpumask(cpu), 1))
 			continue;
 		if (cpu_rmap_copy_neigh(rmap, cpu,
 					topology_core_cpumask(cpu), 2))
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 3d2aa27b845b..061550de77bc 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -33,7 +33,7 @@
 #include <linux/string.h>
 #include <linux/bitops.h>
 #include <linux/rcupdate.h>
-#include <linux/preempt_mask.h>	/* in_interrupt() */
+#include <linux/preempt.h>		/* in_interrupt() */
 
 
 /*
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index fe9a32591c24..3a5f2b366d84 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -85,7 +85,8 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
  * @str: The string to measure.
  * @count: Maximum count (including NUL character)
  *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
@@ -121,7 +122,8 @@ EXPORT_SYMBOL(strnlen_user);
  * strlen_user: - Get the size of a user string INCLUDING final NUL.
  * @str: The string to measure.
  *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
diff --git a/mm/memory.c b/mm/memory.c
index 22e037e3364e..17734c3c1183 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3737,7 +3737,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
 }
 
 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
-void might_fault(void)
+void __might_fault(const char *file, int line)
 {
 	/*
 	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
@@ -3747,21 +3747,15 @@ void might_fault(void)
 	 */
 	if (segment_eq(get_fs(), KERNEL_DS))
 		return;
-
-	/*
-	 * it would be nicer only to annotate paths which are not under
-	 * pagefault_disable, however that requires a larger audit and
-	 * providing helpers like get_user_atomic.
-	 */
-	if (in_atomic())
+	if (pagefault_disabled())
 		return;
-
-	__might_sleep(__FILE__, __LINE__, 0);
-
+	__might_sleep(file, line, 0);
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
 	if (current->mm)
 		might_lock_read(&current->mm->mmap_sem);
+#endif
 }
-EXPORT_SYMBOL(might_fault);
+EXPORT_SYMBOL(__might_fault);
 #endif
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
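
[Editor's illustration, not part of the patch] __might_fault() now consults pagefault_disabled(), i.e. an explicit per-task pagefault-disable depth, rather than inferring the state from in_atomic(). The bookkeeping can be illustrated with a small userspace analogue (thread-local counter plus an assert-style check; all names here are illustrative, not the kernel's):

    /* Hedged userspace analogue of a pagefault_disable() depth counter and a
     * might_fault()-style debug check. */
    #include <assert.h>
    #include <stdio.h>

    static _Thread_local int pagefault_disable_depth;

    static void pagefault_disable(void)  { pagefault_disable_depth++; }
    static void pagefault_enable(void)   { pagefault_disable_depth--; }
    static int  pagefault_disabled(void) { return pagefault_disable_depth != 0; }

    static void might_fault(void)
    {
        /* A faulting (possibly sleeping) access is only legal when the
         * disable depth is zero; nesting disable sections is allowed. */
        assert(!pagefault_disabled());
    }

    int main(void)
    {
        might_fault();            /* fine: pagefaults enabled */
        pagefault_disable();
        printf("depth=%d\n", pagefault_disable_depth);
        pagefault_enable();
        might_fault();            /* fine again */
        return 0;
    }

Tracking the depth explicitly, instead of piggybacking on the preempt count, is what lets the debug check distinguish "pagefaults disabled" from other atomic contexts.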