-rw-r--r--  Documentation/cputopology.txt | 37
-rw-r--r--  Documentation/scheduler/sched-deadline.txt | 184
-rw-r--r--  arch/alpha/mm/fault.c | 5
-rw-r--r--  arch/arc/include/asm/futex.h | 10
-rw-r--r--  arch/arc/mm/fault.c | 2
-rw-r--r--  arch/arm/include/asm/futex.h | 13
-rw-r--r--  arch/arm/include/asm/topology.h | 2
-rw-r--r--  arch/arm/mm/fault.c | 2
-rw-r--r--  arch/arm/mm/highmem.c | 3
-rw-r--r--  arch/arm64/include/asm/futex.h | 4
-rw-r--r--  arch/arm64/include/asm/topology.h | 2
-rw-r--r--  arch/arm64/mm/fault.c | 2
-rw-r--r--  arch/avr32/include/asm/uaccess.h | 12
-rw-r--r--  arch/avr32/mm/fault.c | 4
-rw-r--r--  arch/cris/mm/fault.c | 6
-rw-r--r--  arch/frv/mm/fault.c | 4
-rw-r--r--  arch/frv/mm/highmem.c | 2
-rw-r--r--  arch/hexagon/include/asm/uaccess.h | 3
-rw-r--r--  arch/ia64/include/asm/topology.h | 2
-rw-r--r--  arch/ia64/mm/fault.c | 4
-rw-r--r--  arch/m32r/include/asm/uaccess.h | 30
-rw-r--r--  arch/m32r/mm/fault.c | 8
-rw-r--r--  arch/m68k/include/asm/irqflags.h | 3
-rw-r--r--  arch/m68k/mm/fault.c | 4
-rw-r--r--  arch/metag/mm/fault.c | 2
-rw-r--r--  arch/metag/mm/highmem.c | 4
-rw-r--r--  arch/microblaze/include/asm/uaccess.h | 6
-rw-r--r--  arch/microblaze/mm/fault.c | 8
-rw-r--r--  arch/microblaze/mm/highmem.c | 4
-rw-r--r--  arch/mips/include/asm/topology.h | 2
-rw-r--r--  arch/mips/include/asm/uaccess.h | 45
-rw-r--r--  arch/mips/kernel/signal-common.h | 9
-rw-r--r--  arch/mips/mm/fault.c | 4
-rw-r--r--  arch/mips/mm/highmem.c | 5
-rw-r--r--  arch/mips/mm/init.c | 2
-rw-r--r--  arch/mn10300/include/asm/highmem.h | 3
-rw-r--r--  arch/mn10300/mm/fault.c | 4
-rw-r--r--  arch/nios2/mm/fault.c | 2
-rw-r--r--  arch/parisc/include/asm/cacheflush.h | 2
-rw-r--r--  arch/parisc/kernel/traps.c | 4
-rw-r--r--  arch/parisc/mm/fault.c | 4
-rw-r--r--  arch/powerpc/include/asm/topology.h | 2
-rw-r--r--  arch/powerpc/lib/vmx-helper.c | 11
-rw-r--r--  arch/powerpc/mm/fault.c | 9
-rw-r--r--  arch/powerpc/mm/highmem.c | 4
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c | 2
-rw-r--r--  arch/s390/include/asm/topology.h | 3
-rw-r--r--  arch/s390/include/asm/uaccess.h | 15
-rw-r--r--  arch/s390/mm/fault.c | 2
-rw-r--r--  arch/score/include/asm/uaccess.h | 15
-rw-r--r--  arch/score/mm/fault.c | 3
-rw-r--r--  arch/sh/mm/fault.c | 5
-rw-r--r--  arch/sparc/include/asm/topology_64.h | 2
-rw-r--r--  arch/sparc/mm/fault_32.c | 4
-rw-r--r--  arch/sparc/mm/fault_64.c | 4
-rw-r--r--  arch/sparc/mm/highmem.c | 4
-rw-r--r--  arch/sparc/mm/init_64.c | 2
-rw-r--r--  arch/tile/include/asm/topology.h | 2
-rw-r--r--  arch/tile/include/asm/uaccess.h | 18
-rw-r--r--  arch/tile/mm/fault.c | 4
-rw-r--r--  arch/tile/mm/highmem.c | 3
-rw-r--r--  arch/um/kernel/trap.c | 5
-rw-r--r--  arch/unicore32/mm/fault.c | 2
-rw-r--r--  arch/x86/include/asm/smp.h | 10
-rw-r--r--  arch/x86/include/asm/topology.h | 2
-rw-r--r--  arch/x86/include/asm/uaccess.h | 15
-rw-r--r--  arch/x86/include/asm/uaccess_32.h | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 6
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 3
-rw-r--r--  arch/x86/kernel/process.c | 7
-rw-r--r--  arch/x86/kernel/smpboot.c | 42
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 2
-rw-r--r--  arch/x86/lib/usercopy_32.c | 6
-rw-r--r--  arch/x86/mm/fault.c | 5
-rw-r--r--  arch/x86/mm/highmem_32.c | 3
-rw-r--r--  arch/x86/mm/iomap_32.c | 2
-rw-r--r--  arch/xtensa/mm/fault.c | 4
-rw-r--r--  arch/xtensa/mm/highmem.c | 2
-rw-r--r--  block/blk-mq-cpumap.c | 2
-rw-r--r--  drivers/acpi/acpi_pad.c | 2
-rw-r--r--  drivers/base/topology.c | 2
-rw-r--r--  drivers/cpufreq/acpi-cpufreq.c | 5
-rw-r--r--  drivers/cpufreq/p4-clockmod.c | 2
-rw-r--r--  drivers/cpufreq/powernow-k8.c | 13
-rw-r--r--  drivers/cpufreq/speedstep-ich.c | 2
-rw-r--r--  drivers/crypto/vmx/aes.c | 8
-rw-r--r--  drivers/crypto/vmx/aes_cbc.c | 6
-rw-r--r--  drivers/crypto/vmx/ghash.c | 8
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 3
-rw-r--r--  drivers/hwmon/coretemp.c | 3
-rw-r--r--  drivers/net/ethernet/sfc/efx.c | 2
-rw-r--r--  drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c | 2
-rw-r--r--  drivers/staging/lustre/lustre/ptlrpc/service.c | 4
-rw-r--r--  include/asm-generic/futex.h | 7
-rw-r--r--  include/linux/bottom_half.h | 1
-rw-r--r--  include/linux/hardirq.h | 2
-rw-r--r--  include/linux/highmem.h | 2
-rw-r--r--  include/linux/init_task.h | 5
-rw-r--r--  include/linux/io-mapping.h | 2
-rw-r--r--  include/linux/kernel.h | 3
-rw-r--r--  include/linux/preempt.h | 123
-rw-r--r--  include/linux/preempt_mask.h | 117
-rw-r--r--  include/linux/sched.h | 118
-rw-r--r--  include/linux/topology.h | 6
-rw-r--r--  include/linux/uaccess.h | 48
-rw-r--r--  include/linux/wait.h | 17
-rw-r--r--  include/trace/events/sched.h | 3
-rw-r--r--  ipc/mqueue.c | 54
-rw-r--r--  kernel/fork.c | 8
-rw-r--r--  kernel/futex.c | 33
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/auto_group.c | 6
-rw-r--r--  kernel/sched/auto_group.h | 2
-rw-r--r--  kernel/sched/core.c | 92
-rw-r--r--  kernel/sched/cputime.c | 2
-rw-r--r--  kernel/sched/deadline.c | 2
-rw-r--r--  kernel/sched/fair.c | 276
-rw-r--r--  kernel/sched/loadavg.c (renamed from kernel/sched/proc.c) | 236
-rw-r--r--  kernel/sched/rt.c | 2
-rw-r--r--  kernel/sched/sched.h | 10
-rw-r--r--  kernel/sched/stats.h | 15
-rw-r--r--  kernel/sched/wait.c | 4
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  kernel/time/posix-cpu-timers.c | 87
-rw-r--r--  lib/cpu_rmap.c | 2
-rw-r--r--  lib/radix-tree.c | 2
-rw-r--r--  lib/strnlen_user.c | 6
-rw-r--r--  mm/memory.c | 18
128 files changed, 1245 insertions, 832 deletions
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 0aad6deb2d96..12b1b25b4da9 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -1,6 +1,6 @@
1 1
2Export CPU topology info via sysfs. Items (attributes) are similar 2Export CPU topology info via sysfs. Items (attributes) are similar
3to /proc/cpuinfo. 3to /proc/cpuinfo output of some architectures:
4 4
51) /sys/devices/system/cpu/cpuX/topology/physical_package_id: 51) /sys/devices/system/cpu/cpuX/topology/physical_package_id:
6 6
@@ -23,20 +23,35 @@ to /proc/cpuinfo.
234) /sys/devices/system/cpu/cpuX/topology/thread_siblings: 234) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
24 24
25 internal kernel map of cpuX's hardware threads within the same 25 internal kernel map of cpuX's hardware threads within the same
26 core as cpuX 26 core as cpuX.
27 27
285) /sys/devices/system/cpu/cpuX/topology/core_siblings: 285) /sys/devices/system/cpu/cpuX/topology/thread_siblings_list:
29
30 human-readable list of cpuX's hardware threads within the same
31 core as cpuX.
32
336) /sys/devices/system/cpu/cpuX/topology/core_siblings:
29 34
30 internal kernel map of cpuX's hardware threads within the same 35 internal kernel map of cpuX's hardware threads within the same
31 physical_package_id. 36 physical_package_id.
32 37
336) /sys/devices/system/cpu/cpuX/topology/book_siblings: 387) /sys/devices/system/cpu/cpuX/topology/core_siblings_list:
39
40 human-readable list of cpuX's hardware threads within the same
41 physical_package_id.
42
438) /sys/devices/system/cpu/cpuX/topology/book_siblings:
34 44
35 internal kernel map of cpuX's hardware threads within the same 45 internal kernel map of cpuX's hardware threads within the same
36 book_id. 46 book_id.
37 47
489) /sys/devices/system/cpu/cpuX/topology/book_siblings_list:
49
50 human-readable list of cpuX's hardware threads within the same
51 book_id.
52
38To implement it in an architecture-neutral way, a new source file, 53To implement it in an architecture-neutral way, a new source file,
39drivers/base/topology.c, is to export the 4 or 6 attributes. The two book 54drivers/base/topology.c, is to export the 6 or 9 attributes. The three book
40related sysfs files will only be created if CONFIG_SCHED_BOOK is selected. 55related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
41 56
42For an architecture to support this feature, it must define some of 57For an architecture to support this feature, it must define some of
@@ -44,20 +59,22 @@ these macros in include/asm-XXX/topology.h:
44#define topology_physical_package_id(cpu) 59#define topology_physical_package_id(cpu)
45#define topology_core_id(cpu) 60#define topology_core_id(cpu)
46#define topology_book_id(cpu) 61#define topology_book_id(cpu)
47#define topology_thread_cpumask(cpu) 62#define topology_sibling_cpumask(cpu)
48#define topology_core_cpumask(cpu) 63#define topology_core_cpumask(cpu)
49#define topology_book_cpumask(cpu) 64#define topology_book_cpumask(cpu)
50 65
51The type of **_id is int. 66The type of **_id macros is int.
52The type of siblings is (const) struct cpumask *. 67The type of **_cpumask macros is (const) struct cpumask *. The latter
68correspond with appropriate **_siblings sysfs attributes (except for
69topology_sibling_cpumask() which corresponds with thread_siblings).
53 70
54To be consistent on all architectures, include/linux/topology.h 71To be consistent on all architectures, include/linux/topology.h
55provides default definitions for any of the above macros that are 72provides default definitions for any of the above macros that are
56not defined by include/asm-XXX/topology.h: 73not defined by include/asm-XXX/topology.h:
571) physical_package_id: -1 741) physical_package_id: -1
582) core_id: 0 752) core_id: 0
593) thread_siblings: just the given CPU 763) sibling_cpumask: just the given CPU
604) core_siblings: just the given CPU 774) core_cpumask: just the given CPU
61 78
62For architectures that don't support books (CONFIG_SCHED_BOOK) there are no 79For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
63default definitions for topology_book_id() and topology_book_cpumask(). 80default definitions for topology_book_id() and topology_book_cpumask().
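
With this rename, topology_thread_cpumask() becomes topology_sibling_cpumask() throughout the series. A minimal caller sketch, assuming the helpers as defined after this patch (the function name and pr_info() output below are purely illustrative):

#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/topology.h>

/* Hypothetical example: walk the hardware threads that share a core with
 * @cpu, using the renamed topology_sibling_cpumask() helper. */
static void print_smt_siblings(unsigned int cpu)
{
	unsigned int sibling;

	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
		pr_info("cpu%u: thread sibling cpu%u\n", cpu, sibling);
}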
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index 21461a0441c1..e114513a2731 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -8,6 +8,10 @@ CONTENTS
8 1. Overview 8 1. Overview
9 2. Scheduling algorithm 9 2. Scheduling algorithm
10 3. Scheduling Real-Time Tasks 10 3. Scheduling Real-Time Tasks
11 3.1 Definitions
12 3.2 Schedulability Analysis for Uniprocessor Systems
13 3.3 Schedulability Analysis for Multiprocessor Systems
14 3.4 Relationship with SCHED_DEADLINE Parameters
11 4. Bandwidth management 15 4. Bandwidth management
12 4.1 System-wide settings 16 4.1 System-wide settings
13 4.2 Task interface 17 4.2 Task interface
@@ -43,7 +47,7 @@ CONTENTS
43 "deadline", to schedule tasks. A SCHED_DEADLINE task should receive 47 "deadline", to schedule tasks. A SCHED_DEADLINE task should receive
44 "runtime" microseconds of execution time every "period" microseconds, and 48 "runtime" microseconds of execution time every "period" microseconds, and
45 these "runtime" microseconds are available within "deadline" microseconds 49 these "runtime" microseconds are available within "deadline" microseconds
46 from the beginning of the period. In order to implement this behaviour, 50 from the beginning of the period. In order to implement this behavior,
47 every time the task wakes up, the scheduler computes a "scheduling deadline" 51 every time the task wakes up, the scheduler computes a "scheduling deadline"
48 consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then 52 consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then
49 scheduled using EDF[1] on these scheduling deadlines (the task with the 53 scheduled using EDF[1] on these scheduling deadlines (the task with the
@@ -52,7 +56,7 @@ CONTENTS
52 "admission control" strategy (see Section "4. Bandwidth management") is used 56 "admission control" strategy (see Section "4. Bandwidth management") is used
53 (clearly, if the system is overloaded this guarantee cannot be respected). 57 (clearly, if the system is overloaded this guarantee cannot be respected).
54 58
55 Summing up, the CBS[2,3] algorithms assigns scheduling deadlines to tasks so 59 Summing up, the CBS[2,3] algorithm assigns scheduling deadlines to tasks so
56 that each task runs for at most its runtime every period, avoiding any 60 that each task runs for at most its runtime every period, avoiding any
57 interference between different tasks (bandwidth isolation), while the EDF[1] 61 interference between different tasks (bandwidth isolation), while the EDF[1]
58 algorithm selects the task with the earliest scheduling deadline as the one 62 algorithm selects the task with the earliest scheduling deadline as the one
@@ -63,7 +67,7 @@ CONTENTS
63 In more details, the CBS algorithm assigns scheduling deadlines to 67 In more details, the CBS algorithm assigns scheduling deadlines to
64 tasks in the following way: 68 tasks in the following way:
65 69
66 - Each SCHED_DEADLINE task is characterised by the "runtime", 70 - Each SCHED_DEADLINE task is characterized by the "runtime",
67 "deadline", and "period" parameters; 71 "deadline", and "period" parameters;
68 72
69 - The state of the task is described by a "scheduling deadline", and 73 - The state of the task is described by a "scheduling deadline", and
@@ -78,7 +82,7 @@ CONTENTS
78 82
79 then, if the scheduling deadline is smaller than the current time, or 83 then, if the scheduling deadline is smaller than the current time, or
80 this condition is verified, the scheduling deadline and the 84 this condition is verified, the scheduling deadline and the
81 remaining runtime are re-initialised as 85 remaining runtime are re-initialized as
82 86
83 scheduling deadline = current time + deadline 87 scheduling deadline = current time + deadline
84 remaining runtime = runtime 88 remaining runtime = runtime
@@ -126,31 +130,37 @@ CONTENTS
126 suited for periodic or sporadic real-time tasks that need guarantees on their 130 suited for periodic or sporadic real-time tasks that need guarantees on their
127 timing behavior, e.g., multimedia, streaming, control applications, etc. 131 timing behavior, e.g., multimedia, streaming, control applications, etc.
128 132
1333.1 Definitions
134------------------------
135
129 A typical real-time task is composed of a repetition of computation phases 136 A typical real-time task is composed of a repetition of computation phases
130 (task instances, or jobs) which are activated on a periodic or sporadic 137 (task instances, or jobs) which are activated on a periodic or sporadic
131 fashion. 138 fashion.
132 Each job J_j (where J_j is the j^th job of the task) is characterised by an 139 Each job J_j (where J_j is the j^th job of the task) is characterized by an
133 arrival time r_j (the time when the job starts), an amount of computation 140 arrival time r_j (the time when the job starts), an amount of computation
134 time c_j needed to finish the job, and a job absolute deadline d_j, which 141 time c_j needed to finish the job, and a job absolute deadline d_j, which
135 is the time within which the job should be finished. The maximum execution 142 is the time within which the job should be finished. The maximum execution
136 time max_j{c_j} is called "Worst Case Execution Time" (WCET) for the task. 143 time max{c_j} is called "Worst Case Execution Time" (WCET) for the task.
137 A real-time task can be periodic with period P if r_{j+1} = r_j + P, or 144 A real-time task can be periodic with period P if r_{j+1} = r_j + P, or
 138 sporadic with minimum inter-arrival time P if r_{j+1} >= r_j + P. Finally, 145 sporadic with minimum inter-arrival time P if r_{j+1} >= r_j + P. Finally,
139 d_j = r_j + D, where D is the task's relative deadline. 146 d_j = r_j + D, where D is the task's relative deadline.
140 The utilisation of a real-time task is defined as the ratio between its 147 Summing up, a real-time task can be described as
148 Task = (WCET, D, P)
149
150 The utilization of a real-time task is defined as the ratio between its
141 WCET and its period (or minimum inter-arrival time), and represents 151 WCET and its period (or minimum inter-arrival time), and represents
142 the fraction of CPU time needed to execute the task. 152 the fraction of CPU time needed to execute the task.
143 153
144 If the total utilisation sum_i(WCET_i/P_i) is larger than M (with M equal 154 If the total utilization U=sum(WCET_i/P_i) is larger than M (with M equal
145 to the number of CPUs), then the scheduler is unable to respect all the 155 to the number of CPUs), then the scheduler is unable to respect all the
146 deadlines. 156 deadlines.
147 Note that total utilisation is defined as the sum of the utilisations 157 Note that total utilization is defined as the sum of the utilizations
148 WCET_i/P_i over all the real-time tasks in the system. When considering 158 WCET_i/P_i over all the real-time tasks in the system. When considering
149 multiple real-time tasks, the parameters of the i-th task are indicated 159 multiple real-time tasks, the parameters of the i-th task are indicated
150 with the "_i" suffix. 160 with the "_i" suffix.
151 Moreover, if the total utilisation is larger than M, then we risk starving 161 Moreover, if the total utilization is larger than M, then we risk starving
152 non- real-time tasks by real-time tasks. 162 non- real-time tasks by real-time tasks.
153 If, instead, the total utilisation is smaller than M, then non real-time 163 If, instead, the total utilization is smaller than M, then non real-time
154 tasks will not be starved and the system might be able to respect all the 164 tasks will not be starved and the system might be able to respect all the
155 deadlines. 165 deadlines.
156 As a matter of fact, in this case it is possible to provide an upper bound 166 As a matter of fact, in this case it is possible to provide an upper bound
@@ -159,38 +169,119 @@ CONTENTS
159 More precisely, it can be proven that using a global EDF scheduler the 169 More precisely, it can be proven that using a global EDF scheduler the
160 maximum tardiness of each task is smaller or equal than 170 maximum tardiness of each task is smaller or equal than
161 ((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max 171 ((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max
162 where WCET_max = max_i{WCET_i} is the maximum WCET, WCET_min=min_i{WCET_i} 172 where WCET_max = max{WCET_i} is the maximum WCET, WCET_min=min{WCET_i}
163 is the minimum WCET, and U_max = max_i{WCET_i/P_i} is the maximum utilisation. 173 is the minimum WCET, and U_max = max{WCET_i/P_i} is the maximum
174 utilization[12].
175
1763.2 Schedulability Analysis for Uniprocessor Systems
177------------------------
164 178
165 If M=1 (uniprocessor system), or in case of partitioned scheduling (each 179 If M=1 (uniprocessor system), or in case of partitioned scheduling (each
166 real-time task is statically assigned to one and only one CPU), it is 180 real-time task is statically assigned to one and only one CPU), it is
167 possible to formally check if all the deadlines are respected. 181 possible to formally check if all the deadlines are respected.
168 If D_i = P_i for all tasks, then EDF is able to respect all the deadlines 182 If D_i = P_i for all tasks, then EDF is able to respect all the deadlines
169 of all the tasks executing on a CPU if and only if the total utilisation 183 of all the tasks executing on a CPU if and only if the total utilization
170 of the tasks running on such a CPU is smaller or equal than 1. 184 of the tasks running on such a CPU is smaller or equal than 1.
171 If D_i != P_i for some task, then it is possible to define the density of 185 If D_i != P_i for some task, then it is possible to define the density of
172 a task as C_i/min{D_i,T_i}, and EDF is able to respect all the deadlines 186 a task as WCET_i/min{D_i,P_i}, and EDF is able to respect all the deadlines
173 of all the tasks running on a CPU if the sum sum_i C_i/min{D_i,T_i} of the 187 of all the tasks running on a CPU if the sum of the densities of the tasks
174 densities of the tasks running on such a CPU is smaller or equal than 1 188 running on such a CPU is smaller or equal than 1:
175 (notice that this condition is only sufficient, and not necessary). 189 sum(WCET_i / min{D_i, P_i}) <= 1
190 It is important to notice that this condition is only sufficient, and not
191 necessary: there are task sets that are schedulable, but do not respect the
192 condition. For example, consider the task set {Task_1,Task_2} composed by
193 Task_1=(50ms,50ms,100ms) and Task_2=(10ms,100ms,100ms).
194 EDF is clearly able to schedule the two tasks without missing any deadline
195 (Task_1 is scheduled as soon as it is released, and finishes just in time
196 to respect its deadline; Task_2 is scheduled immediately after Task_1, hence
197 its response time cannot be larger than 50ms + 10ms = 60ms) even if
198 50 / min{50,100} + 10 / min{100, 100} = 50 / 50 + 10 / 100 = 1.1
199 Of course it is possible to test the exact schedulability of tasks with
200 D_i != P_i (checking a condition that is both sufficient and necessary),
201 but this cannot be done by comparing the total utilization or density with
202 a constant. Instead, the so called "processor demand" approach can be used,
203 computing the total amount of CPU time h(t) needed by all the tasks to
204 respect all of their deadlines in a time interval of size t, and comparing
205 such a time with the interval size t. If h(t) is smaller than t (that is,
206 the amount of time needed by the tasks in a time interval of size t is
207 smaller than the size of the interval) for all the possible values of t, then
208 EDF is able to schedule the tasks respecting all of their deadlines. Since
209 performing this check for all possible values of t is impossible, it has been
210 proven[4,5,6] that it is sufficient to perform the test for values of t
211 between 0 and a maximum value L. The cited papers contain all of the
212 mathematical details and explain how to compute h(t) and L.
213 In any case, this kind of analysis is too complex as well as too
214 time-consuming to be performed on-line. Hence, as explained in Section
215 4 Linux uses an admission test based on the tasks' utilizations.
216
2173.3 Schedulability Analysis for Multiprocessor Systems
218------------------------
176 219
177 On multiprocessor systems with global EDF scheduling (non partitioned 220 On multiprocessor systems with global EDF scheduling (non partitioned
178 systems), a sufficient test for schedulability can not be based on the 221 systems), a sufficient test for schedulability can not be based on the
179 utilisations (it can be shown that task sets with utilisations slightly 222 utilizations or densities: it can be shown that even if D_i = P_i task
180 larger than 1 can miss deadlines regardless of the number of CPUs M). 223 sets with utilizations slightly larger than 1 can miss deadlines regardless
181 However, as previously stated, enforcing that the total utilisation is smaller 224 of the number of CPUs.
182 than M is enough to guarantee that non real-time tasks are not starved and 225
183 that the tardiness of real-time tasks has an upper bound. 226 Consider a set {Task_1,...Task_{M+1}} of M+1 tasks on a system with M
227 CPUs, with the first task Task_1=(P,P,P) having period, relative deadline
228 and WCET equal to P. The remaining M tasks Task_i=(e,P-1,P-1) have an
229 arbitrarily small worst case execution time (indicated as "e" here) and a
230 period smaller than the one of the first task. Hence, if all the tasks
231 activate at the same time t, global EDF schedules these M tasks first
232 (because their absolute deadlines are equal to t + P - 1, hence they are
233 smaller than the absolute deadline of Task_1, which is t + P). As a
234 result, Task_1 can be scheduled only at time t + e, and will finish at
235 time t + e + P, after its absolute deadline. The total utilization of the
236 task set is U = M · e / (P - 1) + P / P = M · e / (P - 1) + 1, and for small
237 values of e this can become very close to 1. This is known as "Dhall's
238 effect"[7]. Note: the example in the original paper by Dhall has been
239 slightly simplified here (for example, Dhall more correctly computed
240 lim_{e->0}U).
241
242 More complex schedulability tests for global EDF have been developed in
243 real-time literature[8,9], but they are not based on a simple comparison
244 between total utilization (or density) and a fixed constant. If all tasks
245 have D_i = P_i, a sufficient schedulability condition can be expressed in
246 a simple way:
247 sum(WCET_i / P_i) <= M - (M - 1) · U_max
248 where U_max = max{WCET_i / P_i}[10]. Notice that for U_max = 1,
249 M - (M - 1) · U_max becomes M - M + 1 = 1 and this schedulability condition
250 just confirms the Dhall's effect. A more complete survey of the literature
251 about schedulability tests for multi-processor real-time scheduling can be
252 found in [11].
253
254 As seen, enforcing that the total utilization is smaller than M does not
255 guarantee that global EDF schedules the tasks without missing any deadline
256 (in other words, global EDF is not an optimal scheduling algorithm). However,
257 a total utilization smaller than M is enough to guarantee that non real-time
258 tasks are not starved and that the tardiness of real-time tasks has an upper
259 bound[12] (as previously noted). Different bounds on the maximum tardiness
260 experienced by real-time tasks have been developed in various papers[13,14],
261 but the theoretical result that is important for SCHED_DEADLINE is that if
262 the total utilization is smaller or equal than M then the response times of
263 the tasks are limited.
264
2653.4 Relationship with SCHED_DEADLINE Parameters
266------------------------
184 267
185 SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that 268 Finally, it is important to understand the relationship between the
186 the jobs' deadlines of a task are respected. In order to do this, a task 269 SCHED_DEADLINE scheduling parameters described in Section 2 (runtime,
187 must be scheduled by setting: 270 deadline and period) and the real-time task parameters (WCET, D, P)
271 described in this section. Note that the tasks' temporal constraints are
272 represented by its absolute deadlines d_j = r_j + D described above, while
273 SCHED_DEADLINE schedules the tasks according to scheduling deadlines (see
274 Section 2).
275 If an admission test is used to guarantee that the scheduling deadlines
276 are respected, then SCHED_DEADLINE can be used to schedule real-time tasks
277 guaranteeing that all the jobs' deadlines of a task are respected.
278 In order to do this, a task must be scheduled by setting:
188 279
189 - runtime >= WCET 280 - runtime >= WCET
190 - deadline = D 281 - deadline = D
191 - period <= P 282 - period <= P
192 283
193 IOW, if runtime >= WCET and if period is >= P, then the scheduling deadlines 284 IOW, if runtime >= WCET and if period is <= P, then the scheduling deadlines
194 and the absolute deadlines (d_j) coincide, so a proper admission control 285 and the absolute deadlines (d_j) coincide, so a proper admission control
195 allows to respect the jobs' absolute deadlines for this task (this is what is 286 allows to respect the jobs' absolute deadlines for this task (this is what is
196 called "hard schedulability property" and is an extension of Lemma 1 of [2]). 287 called "hard schedulability property" and is an extension of Lemma 1 of [2]).
@@ -206,6 +297,39 @@ CONTENTS
206 Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf 297 Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf
207 3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab 298 3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab
208 Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf 299 Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf
300 4 - J. Y. Leung and M.L. Merril. A Note on Preemptive Scheduling of
301 Periodic, Real-Time Tasks. Information Processing Letters, vol. 11,
302 no. 3, pp. 115-118, 1980.
303 5 - S. K. Baruah, A. K. Mok and L. E. Rosier. Preemptively Scheduling
304 Hard-Real-Time Sporadic Tasks on One Processor. Proceedings of the
305 11th IEEE Real-time Systems Symposium, 1990.
306 6 - S. K. Baruah, L. E. Rosier and R. R. Howell. Algorithms and Complexity
307 Concerning the Preemptive Scheduling of Periodic Real-Time tasks on
308 One Processor. Real-Time Systems Journal, vol. 4, no. 2, pp 301-324,
309 1990.
310 7 - S. J. Dhall and C. L. Liu. On a real-time scheduling problem. Operations
311 research, vol. 26, no. 1, pp 127-140, 1978.
312 8 - T. Baker. Multiprocessor EDF and Deadline Monotonic Schedulability
313 Analysis. Proceedings of the 24th IEEE Real-Time Systems Symposium, 2003.
314 9 - T. Baker. An Analysis of EDF Schedulability on a Multiprocessor.
315 IEEE Transactions on Parallel and Distributed Systems, vol. 16, no. 8,
316 pp 760-768, 2005.
317 10 - J. Goossens, S. Funk and S. Baruah, Priority-Driven Scheduling of
318 Periodic Task Systems on Multiprocessors. Real-Time Systems Journal,
319 vol. 25, no. 2–3, pp. 187–205, 2003.
320 11 - R. Davis and A. Burns. A Survey of Hard Real-Time Scheduling for
321 Multiprocessor Systems. ACM Computing Surveys, vol. 43, no. 4, 2011.
322 http://www-users.cs.york.ac.uk/~robdavis/papers/MPSurveyv5.0.pdf
323 12 - U. C. Devi and J. H. Anderson. Tardiness Bounds under Global EDF
324 Scheduling on a Multiprocessor. Real-Time Systems Journal, vol. 32,
325 no. 2, pp 133-189, 2008.
326 13 - P. Valente and G. Lipari. An Upper Bound to the Lateness of Soft
327 Real-Time Tasks Scheduled by EDF on Multiprocessors. Proceedings of
328 the 26th IEEE Real-Time Systems Symposium, 2005.
329 14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
330 Global EDF. Proceedings of the 22nd Euromicro Conference on
331 Real-Time Systems, 2010.
332
209 333
2104. Bandwidth management 3344. Bandwidth management
211======================= 335=======================
@@ -218,10 +342,10 @@ CONTENTS
218 no guarantee can be given on the actual scheduling of the -deadline tasks. 342 no guarantee can be given on the actual scheduling of the -deadline tasks.
219 343
220 As already stated in Section 3, a necessary condition to be respected to 344 As already stated in Section 3, a necessary condition to be respected to
221 correctly schedule a set of real-time tasks is that the total utilisation 345 correctly schedule a set of real-time tasks is that the total utilization
222 is smaller than M. When talking about -deadline tasks, this requires that 346 is smaller than M. When talking about -deadline tasks, this requires that
223 the sum of the ratio between runtime and period for all tasks is smaller 347 the sum of the ratio between runtime and period for all tasks is smaller
224 than M. Notice that the ratio runtime/period is equivalent to the utilisation 348 than M. Notice that the ratio runtime/period is equivalent to the utilization
225 of a "traditional" real-time task, and is also often referred to as 349 of a "traditional" real-time task, and is also often referred to as
226 "bandwidth". 350 "bandwidth".
227 The interface used to control the CPU bandwidth that can be allocated 351 The interface used to control the CPU bandwidth that can be allocated
@@ -251,7 +375,7 @@ CONTENTS
251 The system wide settings are configured under the /proc virtual file system. 375 The system wide settings are configured under the /proc virtual file system.
252 376
253 For now the -rt knobs are used for -deadline admission control and the 377 For now the -rt knobs are used for -deadline admission control and the
254 -deadline runtime is accounted against the -rt runtime. We realise that this 378 -deadline runtime is accounted against the -rt runtime. We realize that this
255 isn't entirely desirable; however, it is better to have a small interface for 379 isn't entirely desirable; however, it is better to have a small interface for
256 now, and be able to change it easily later. The ideal situation (see 5.) is to 380 now, and be able to change it easily later. The ideal situation (see 5.) is to
257 run -rt tasks from a -deadline server; in which case the -rt bandwidth is a 381 run -rt tasks from a -deadline server; in which case the -rt bandwidth is a
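
The two sufficient tests quoted above (the uniprocessor density check and the global-EDF bound from [10]) are easy to evaluate numerically. The stand-alone sketch below reuses the document's own task set Task_1=(50ms,50ms,100ms), Task_2=(10ms,100ms,100ms); it is illustrative user-space code, not part of the patch:

#include <stdio.h>

struct task { double wcet, d, p; };	/* (WCET, D, P), all in ms */

static double dmin(double a, double b) { return a < b ? a : b; }

int main(void)
{
	/* The example from Section 3.2: schedulable on one CPU even though
	 * the (sufficient-only) density test fails. */
	const struct task set[] = { { 50, 50, 100 }, { 10, 100, 100 } };
	const int n = 2, m = 1;			/* m = number of CPUs */
	double density = 0, u = 0, u_max = 0;

	for (int i = 0; i < n; i++) {
		double ui = set[i].wcet / set[i].p;

		density += set[i].wcet / dmin(set[i].d, set[i].p);
		u += ui;
		if (ui > u_max)
			u_max = ui;
	}

	/* Uniprocessor density test: sum(WCET_i / min{D_i, P_i}) <= 1 */
	printf("density sum = %.2f\n", density);	/* prints 1.10 */
	/* Global EDF test [10]: sum(WCET_i / P_i) <= M - (M - 1) * U_max */
	printf("U = %.2f, bound = %.2f\n", u, m - (m - 1) * u_max);
	return 0;
}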
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 9d0ac091a52a..4a905bd667e2 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -23,8 +23,7 @@
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/module.h> 25#include <linux/module.h>
26 26#include <linux/uaccess.h>
27#include <asm/uaccess.h>
28 27
29extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *); 28extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *);
30 29
@@ -107,7 +106,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
107 106
108 /* If we're in an interrupt context, or have no user context, 107 /* If we're in an interrupt context, or have no user context,
109 we must not take the fault. */ 108 we must not take the fault. */
110 if (!mm || in_atomic()) 109 if (!mm || faulthandler_disabled())
111 goto no_context; 110 goto no_context;
112 111
113#ifdef CONFIG_ALPHA_LARGE_VMALLOC 112#ifdef CONFIG_ALPHA_LARGE_VMALLOC
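
Every fault handler in this series replaces the in_atomic() bail-out with faulthandler_disabled(), as above for alpha. The helper itself lands in include/linux/uaccess.h (listed in the diffstat but not shown on this page); a rough sketch of the intended check, assuming a pagefault_disabled() predicate that reports whether pagefault_disable() is currently in effect, and not necessarily the verbatim implementation:

/* Sketch only: a fault must be refused both when pagefaults were disabled
 * explicitly via pagefault_disable() and when we are in atomic context for
 * any other reason. */
static inline bool faulthandler_disabled(void)
{
	return pagefault_disabled() || in_atomic();
}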
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index 4dc64ddebece..05b5aaf5b0f9 100644
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -53,7 +53,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
53 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 53 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
54 return -EFAULT; 54 return -EFAULT;
55 55
56 pagefault_disable(); /* implies preempt_disable() */ 56 pagefault_disable();
57 57
58 switch (op) { 58 switch (op) {
59 case FUTEX_OP_SET: 59 case FUTEX_OP_SET:
@@ -75,7 +75,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
75 ret = -ENOSYS; 75 ret = -ENOSYS;
76 } 76 }
77 77
78 pagefault_enable(); /* subsumes preempt_enable() */ 78 pagefault_enable();
79 79
80 if (!ret) { 80 if (!ret) {
81 switch (cmp) { 81 switch (cmp) {
@@ -104,7 +104,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
104 return ret; 104 return ret;
105} 105}
106 106
107/* Compare-xchg with preemption disabled. 107/* Compare-xchg with pagefaults disabled.
108 * Notes: 108 * Notes:
109 * -Best-Effort: Exchg happens only if compare succeeds. 109 * -Best-Effort: Exchg happens only if compare succeeds.
110 * If compare fails, returns; leaving retry/looping to upper layers 110 * If compare fails, returns; leaving retry/looping to upper layers
@@ -121,7 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
121 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 121 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
122 return -EFAULT; 122 return -EFAULT;
123 123
124 pagefault_disable(); /* implies preempt_disable() */ 124 pagefault_disable();
125 125
126 /* TBD : can use llock/scond */ 126 /* TBD : can use llock/scond */
127 __asm__ __volatile__( 127 __asm__ __volatile__(
@@ -142,7 +142,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
142 : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) 142 : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT)
143 : "cc", "memory"); 143 : "cc", "memory");
144 144
145 pagefault_enable(); /* subsumes preempt_enable() */ 145 pagefault_enable();
146 146
147 *uval = val; 147 *uval = val;
148 return val; 148 return val;
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 6a2e006cbcce..d948e4e9d89c 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -86,7 +86,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
86 * If we're in an interrupt or have no user 86 * If we're in an interrupt or have no user
87 * context, we must not take the fault.. 87 * context, we must not take the fault..
88 */ 88 */
89 if (in_atomic() || !mm) 89 if (faulthandler_disabled() || !mm)
90 goto no_context; 90 goto no_context;
91 91
92 if (user_mode(regs)) 92 if (user_mode(regs))
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 4e78065a16aa..5eed82809d82 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -93,6 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
93 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) 93 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
94 return -EFAULT; 94 return -EFAULT;
95 95
96 preempt_disable();
96 __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" 97 __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
97 "1: " TUSER(ldr) " %1, [%4]\n" 98 "1: " TUSER(ldr) " %1, [%4]\n"
98 " teq %1, %2\n" 99 " teq %1, %2\n"
@@ -104,6 +105,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
104 : "cc", "memory"); 105 : "cc", "memory");
105 106
106 *uval = val; 107 *uval = val;
108 preempt_enable();
109
107 return ret; 110 return ret;
108} 111}
109 112
@@ -124,7 +127,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
124 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) 127 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
125 return -EFAULT; 128 return -EFAULT;
126 129
127 pagefault_disable(); /* implies preempt_disable() */ 130#ifndef CONFIG_SMP
131 preempt_disable();
132#endif
133 pagefault_disable();
128 134
129 switch (op) { 135 switch (op) {
130 case FUTEX_OP_SET: 136 case FUTEX_OP_SET:
@@ -146,7 +152,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
146 ret = -ENOSYS; 152 ret = -ENOSYS;
147 } 153 }
148 154
149 pagefault_enable(); /* subsumes preempt_enable() */ 155 pagefault_enable();
156#ifndef CONFIG_SMP
157 preempt_enable();
158#endif
150 159
151 if (!ret) { 160 if (!ret) {
152 switch (cmp) { 161 switch (cmp) {
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 2fe85fff5cca..370f7a732900 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
18#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id) 18#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id)
19#define topology_core_id(cpu) (cpu_topology[cpu].core_id) 19#define topology_core_id(cpu) (cpu_topology[cpu].core_id)
20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) 20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling)
21#define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) 21#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling)
22 22
23void init_cpu_topology(void); 23void init_cpu_topology(void);
24void store_cpu_topology(unsigned int cpuid); 24void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 6333d9c17875..0d629b8f973f 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -276,7 +276,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
276 * If we're in an interrupt or have no user 276 * If we're in an interrupt or have no user
277 * context, we must not take the fault.. 277 * context, we must not take the fault..
278 */ 278 */
279 if (in_atomic() || !mm) 279 if (faulthandler_disabled() || !mm)
280 goto no_context; 280 goto no_context;
281 281
282 if (user_mode(regs)) 282 if (user_mode(regs))
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index b98895d9fe57..ee8dfa793989 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -59,6 +59,7 @@ void *kmap_atomic(struct page *page)
59 void *kmap; 59 void *kmap;
60 int type; 60 int type;
61 61
62 preempt_disable();
62 pagefault_disable(); 63 pagefault_disable();
63 if (!PageHighMem(page)) 64 if (!PageHighMem(page))
64 return page_address(page); 65 return page_address(page);
@@ -121,6 +122,7 @@ void __kunmap_atomic(void *kvaddr)
121 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); 122 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
122 } 123 }
123 pagefault_enable(); 124 pagefault_enable();
125 preempt_enable();
124} 126}
125EXPORT_SYMBOL(__kunmap_atomic); 127EXPORT_SYMBOL(__kunmap_atomic);
126 128
@@ -130,6 +132,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
130 int idx, type; 132 int idx, type;
131 struct page *page = pfn_to_page(pfn); 133 struct page *page = pfn_to_page(pfn);
132 134
135 preempt_disable();
133 pagefault_disable(); 136 pagefault_disable();
134 if (!PageHighMem(page)) 137 if (!PageHighMem(page))
135 return page_address(page); 138 return page_address(page);
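
kmap_atomic() and kmap_atomic_pfn() now call preempt_disable() themselves before pagefault_disable(), and __kunmap_atomic() drops both in reverse order, so callers no longer depend on pagefault_disable() implying disabled preemption. A minimal caller sketch under that assumption (illustrative, not from the patch; the helper name is made up):

#include <linux/highmem.h>
#include <linux/string.h>

/* Hypothetical helper: zero a (possibly highmem) page through a temporary
 * atomic mapping.  The preempt/pagefault bracketing happens inside
 * kmap_atomic()/kunmap_atomic() after this series. */
static void zero_page_atomic(struct page *page)
{
	void *kaddr = kmap_atomic(page);

	memset(kaddr, 0, PAGE_SIZE);
	kunmap_atomic(kaddr);
}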
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 5f750dc96e0f..74069b3bd919 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -58,7 +58,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
58 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) 58 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
59 return -EFAULT; 59 return -EFAULT;
60 60
61 pagefault_disable(); /* implies preempt_disable() */ 61 pagefault_disable();
62 62
63 switch (op) { 63 switch (op) {
64 case FUTEX_OP_SET: 64 case FUTEX_OP_SET:
@@ -85,7 +85,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
85 ret = -ENOSYS; 85 ret = -ENOSYS;
86 } 86 }
87 87
88 pagefault_enable(); /* subsumes preempt_enable() */ 88 pagefault_enable();
89 89
90 if (!ret) { 90 if (!ret) {
91 switch (cmp) { 91 switch (cmp) {
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 7ebcd31ce51c..225ec3524fbf 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
18#define topology_physical_package_id(cpu) (cpu_topology[cpu].cluster_id) 18#define topology_physical_package_id(cpu) (cpu_topology[cpu].cluster_id)
19#define topology_core_id(cpu) (cpu_topology[cpu].core_id) 19#define topology_core_id(cpu) (cpu_topology[cpu].core_id)
20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) 20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling)
21#define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) 21#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling)
22 22
23void init_cpu_topology(void); 23void init_cpu_topology(void);
24void store_cpu_topology(unsigned int cpuid); 24void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 96da13167d4a..0948d327d013 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -211,7 +211,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
211 * If we're in an interrupt or have no user context, we must not take 211 * If we're in an interrupt or have no user context, we must not take
212 * the fault. 212 * the fault.
213 */ 213 */
214 if (in_atomic() || !mm) 214 if (faulthandler_disabled() || !mm)
215 goto no_context; 215 goto no_context;
216 216
217 if (user_mode(regs)) 217 if (user_mode(regs))
diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h
index a46f7cf3e1ea..68cf638faf48 100644
--- a/arch/avr32/include/asm/uaccess.h
+++ b/arch/avr32/include/asm/uaccess.h
@@ -97,7 +97,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
97 * @x: Value to copy to user space. 97 * @x: Value to copy to user space.
98 * @ptr: Destination address, in user space. 98 * @ptr: Destination address, in user space.
99 * 99 *
100 * Context: User context only. This function may sleep. 100 * Context: User context only. This function may sleep if pagefaults are
101 * enabled.
101 * 102 *
102 * This macro copies a single simple value from kernel space to user 103 * This macro copies a single simple value from kernel space to user
103 * space. It supports simple types like char and int, but not larger 104 * space. It supports simple types like char and int, but not larger
@@ -116,7 +117,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
116 * @x: Variable to store result. 117 * @x: Variable to store result.
117 * @ptr: Source address, in user space. 118 * @ptr: Source address, in user space.
118 * 119 *
119 * Context: User context only. This function may sleep. 120 * Context: User context only. This function may sleep if pagefaults are
121 * enabled.
120 * 122 *
121 * This macro copies a single simple variable from user space to kernel 123 * This macro copies a single simple variable from user space to kernel
122 * space. It supports simple types like char and int, but not larger 124 * space. It supports simple types like char and int, but not larger
@@ -136,7 +138,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
136 * @x: Value to copy to user space. 138 * @x: Value to copy to user space.
137 * @ptr: Destination address, in user space. 139 * @ptr: Destination address, in user space.
138 * 140 *
139 * Context: User context only. This function may sleep. 141 * Context: User context only. This function may sleep if pagefaults are
142 * enabled.
140 * 143 *
141 * This macro copies a single simple value from kernel space to user 144 * This macro copies a single simple value from kernel space to user
142 * space. It supports simple types like char and int, but not larger 145 * space. It supports simple types like char and int, but not larger
@@ -158,7 +161,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
158 * @x: Variable to store result. 161 * @x: Variable to store result.
159 * @ptr: Source address, in user space. 162 * @ptr: Source address, in user space.
160 * 163 *
161 * Context: User context only. This function may sleep. 164 * Context: User context only. This function may sleep if pagefaults are
165 * enabled.
162 * 166 *
163 * This macro copies a single simple variable from user space to kernel 167 * This macro copies a single simple variable from user space to kernel
164 * space. It supports simple types like char and int, but not larger 168 * space. It supports simple types like char and int, but not larger
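
The kerneldoc updates above (repeated for hexagon, m32r and others further down) change "This function may sleep" to "may sleep if pagefaults are enabled". A sketch of what that distinction means for a caller, assuming the __copy_from_user_inatomic() helper available in kernels of this era (illustrative, not from the patch):

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/* Hypothetical helper: read one user word without ever sleeping.  With
 * pagefaults disabled the access fails fast instead of faulting the page
 * in; the caller can retry with copy_from_user() from a sleepable context. */
static int read_user_word_nonblocking(u32 *dst, const u32 __user *src)
{
	unsigned long left;

	pagefault_disable();
	left = __copy_from_user_inatomic(dst, src, sizeof(*dst));
	pagefault_enable();

	return left ? -EFAULT : 0;
}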
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index d223a8b57c1e..c03533937a9f 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -14,11 +14,11 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/kdebug.h> 15#include <linux/kdebug.h>
16#include <linux/kprobes.h> 16#include <linux/kprobes.h>
17#include <linux/uaccess.h>
17 18
18#include <asm/mmu_context.h> 19#include <asm/mmu_context.h>
19#include <asm/sysreg.h> 20#include <asm/sysreg.h>
20#include <asm/tlb.h> 21#include <asm/tlb.h>
21#include <asm/uaccess.h>
22 22
23#ifdef CONFIG_KPROBES 23#ifdef CONFIG_KPROBES
24static inline int notify_page_fault(struct pt_regs *regs, int trap) 24static inline int notify_page_fault(struct pt_regs *regs, int trap)
@@ -81,7 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
81 * If we're in an interrupt or have no user context, we must 81 * If we're in an interrupt or have no user context, we must
82 * not take the fault... 82 * not take the fault...
83 */ 83 */
84 if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM)) 84 if (faulthandler_disabled() || !mm || regs->sr & SYSREG_BIT(GM))
85 goto no_context; 85 goto no_context;
86 86
87 local_irq_enable(); 87 local_irq_enable();
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 83f12f2ed9e3..3066d40a6db1 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -8,7 +8,7 @@
8#include <linux/interrupt.h> 8#include <linux/interrupt.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <asm/uaccess.h> 11#include <linux/uaccess.h>
12#include <arch/system.h> 12#include <arch/system.h>
13 13
14extern int find_fixup_code(struct pt_regs *); 14extern int find_fixup_code(struct pt_regs *);
@@ -109,11 +109,11 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
109 info.si_code = SEGV_MAPERR; 109 info.si_code = SEGV_MAPERR;
110 110
111 /* 111 /*
112 * If we're in an interrupt or "atomic" operation or have no 112 * If we're in an interrupt, have pagefaults disabled or have no
113 * user context, we must not take the fault. 113 * user context, we must not take the fault.
114 */ 114 */
115 115
116 if (in_atomic() || !mm) 116 if (faulthandler_disabled() || !mm)
117 goto no_context; 117 goto no_context;
118 118
119 if (user_mode(regs)) 119 if (user_mode(regs))
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
index ec4917ddf678..61d99767fe16 100644
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -19,9 +19,9 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/ptrace.h> 20#include <linux/ptrace.h>
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/uaccess.h>
22 23
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/uaccess.h>
25#include <asm/gdb-stub.h> 25#include <asm/gdb-stub.h>
26 26
27/*****************************************************************************/ 27/*****************************************************************************/
@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
78 * If we're in an interrupt or have no user 78 * If we're in an interrupt or have no user
79 * context, we must not take the fault.. 79 * context, we must not take the fault..
80 */ 80 */
81 if (in_atomic() || !mm) 81 if (faulthandler_disabled() || !mm)
82 goto no_context; 82 goto no_context;
83 83
84 if (user_mode(__frame)) 84 if (user_mode(__frame))
diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c
index bed9a9bd3c10..785344bbdc07 100644
--- a/arch/frv/mm/highmem.c
+++ b/arch/frv/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
42 unsigned long paddr; 42 unsigned long paddr;
43 int type; 43 int type;
44 44
45 preempt_disable();
45 pagefault_disable(); 46 pagefault_disable();
46 type = kmap_atomic_idx_push(); 47 type = kmap_atomic_idx_push();
47 paddr = page_to_phys(page); 48 paddr = page_to_phys(page);
@@ -85,5 +86,6 @@ void __kunmap_atomic(void *kvaddr)
85 } 86 }
86 kmap_atomic_idx_pop(); 87 kmap_atomic_idx_pop();
87 pagefault_enable(); 88 pagefault_enable();
89 preempt_enable();
88} 90}
89EXPORT_SYMBOL(__kunmap_atomic); 91EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h
index e4127e4d6a5b..f000a382bc7f 100644
--- a/arch/hexagon/include/asm/uaccess.h
+++ b/arch/hexagon/include/asm/uaccess.h
@@ -36,7 +36,8 @@
36 * @addr: User space pointer to start of block to check 36 * @addr: User space pointer to start of block to check
37 * @size: Size of block to check 37 * @size: Size of block to check
38 * 38 *
39 * Context: User context only. This function may sleep. 39 * Context: User context only. This function may sleep if pagefaults are
40 * enabled.
40 * 41 *
41 * Checks if a pointer to a block of memory in user space is valid. 42 * Checks if a pointer to a block of memory in user space is valid.
42 * 43 *
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 6437ca21f61b..3ad8f6988363 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -53,7 +53,7 @@ void build_cpu_to_node_map(void);
53#define topology_physical_package_id(cpu) (cpu_data(cpu)->socket_id) 53#define topology_physical_package_id(cpu) (cpu_data(cpu)->socket_id)
54#define topology_core_id(cpu) (cpu_data(cpu)->core_id) 54#define topology_core_id(cpu) (cpu_data(cpu)->core_id)
55#define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) 55#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
56#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) 56#define topology_sibling_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
57#endif 57#endif
58 58
59extern void arch_fix_phys_package_id(int num, u32 slot); 59extern void arch_fix_phys_package_id(int num, u32 slot);
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index ba5ba7accd0d..70b40d1205a6 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -11,10 +11,10 @@
11#include <linux/kprobes.h> 11#include <linux/kprobes.h>
12#include <linux/kdebug.h> 12#include <linux/kdebug.h>
13#include <linux/prefetch.h> 13#include <linux/prefetch.h>
14#include <linux/uaccess.h>
14 15
15#include <asm/pgtable.h> 16#include <asm/pgtable.h>
16#include <asm/processor.h> 17#include <asm/processor.h>
17#include <asm/uaccess.h>
18 18
19extern int die(char *, struct pt_regs *, long); 19extern int die(char *, struct pt_regs *, long);
20 20
@@ -96,7 +96,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
96 /* 96 /*
97 * If we're in an interrupt or have no user context, we must not take the fault.. 97 * If we're in an interrupt or have no user context, we must not take the fault..
98 */ 98 */
99 if (in_atomic() || !mm) 99 if (faulthandler_disabled() || !mm)
100 goto no_context; 100 goto no_context;
101 101
102#ifdef CONFIG_VIRTUAL_MEM_MAP 102#ifdef CONFIG_VIRTUAL_MEM_MAP
diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h
index 71adff209405..cac7014daef3 100644
--- a/arch/m32r/include/asm/uaccess.h
+++ b/arch/m32r/include/asm/uaccess.h
@@ -91,7 +91,8 @@ static inline void set_fs(mm_segment_t s)
91 * @addr: User space pointer to start of block to check 91 * @addr: User space pointer to start of block to check
92 * @size: Size of block to check 92 * @size: Size of block to check
93 * 93 *
94 * Context: User context only. This function may sleep. 94 * Context: User context only. This function may sleep if pagefaults are
95 * enabled.
95 * 96 *
96 * Checks if a pointer to a block of memory in user space is valid. 97 * Checks if a pointer to a block of memory in user space is valid.
97 * 98 *
@@ -155,7 +156,8 @@ extern int fixup_exception(struct pt_regs *regs);
155 * @x: Variable to store result. 156 * @x: Variable to store result.
156 * @ptr: Source address, in user space. 157 * @ptr: Source address, in user space.
157 * 158 *
158 * Context: User context only. This function may sleep. 159 * Context: User context only. This function may sleep if pagefaults are
160 * enabled.
159 * 161 *
160 * This macro copies a single simple variable from user space to kernel 162 * This macro copies a single simple variable from user space to kernel
161 * space. It supports simple types like char and int, but not larger 163 * space. It supports simple types like char and int, but not larger
@@ -175,7 +177,8 @@ extern int fixup_exception(struct pt_regs *regs);
175 * @x: Value to copy to user space. 177 * @x: Value to copy to user space.
176 * @ptr: Destination address, in user space. 178 * @ptr: Destination address, in user space.
177 * 179 *
178 * Context: User context only. This function may sleep. 180 * Context: User context only. This function may sleep if pagefaults are
181 * enabled.
179 * 182 *
180 * This macro copies a single simple value from kernel space to user 183 * This macro copies a single simple value from kernel space to user
181 * space. It supports simple types like char and int, but not larger 184 * space. It supports simple types like char and int, but not larger
@@ -194,7 +197,8 @@ extern int fixup_exception(struct pt_regs *regs);
194 * @x: Variable to store result. 197 * @x: Variable to store result.
195 * @ptr: Source address, in user space. 198 * @ptr: Source address, in user space.
196 * 199 *
197 * Context: User context only. This function may sleep. 200 * Context: User context only. This function may sleep if pagefaults are
201 * enabled.
198 * 202 *
199 * This macro copies a single simple variable from user space to kernel 203 * This macro copies a single simple variable from user space to kernel
200 * space. It supports simple types like char and int, but not larger 204 * space. It supports simple types like char and int, but not larger
@@ -274,7 +278,8 @@ do { \
274 * @x: Value to copy to user space. 278 * @x: Value to copy to user space.
275 * @ptr: Destination address, in user space. 279 * @ptr: Destination address, in user space.
276 * 280 *
277 * Context: User context only. This function may sleep. 281 * Context: User context only. This function may sleep if pagefaults are
282 * enabled.
278 * 283 *
279 * This macro copies a single simple value from kernel space to user 284 * This macro copies a single simple value from kernel space to user
280 * space. It supports simple types like char and int, but not larger 285 * space. It supports simple types like char and int, but not larger
@@ -568,7 +573,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
568 * @from: Source address, in kernel space. 573 * @from: Source address, in kernel space.
569 * @n: Number of bytes to copy. 574 * @n: Number of bytes to copy.
570 * 575 *
571 * Context: User context only. This function may sleep. 576 * Context: User context only. This function may sleep if pagefaults are
577 * enabled.
572 * 578 *
573 * Copy data from kernel space to user space. Caller must check 579 * Copy data from kernel space to user space. Caller must check
574 * the specified block with access_ok() before calling this function. 580 * the specified block with access_ok() before calling this function.
@@ -588,7 +594,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
588 * @from: Source address, in kernel space. 594 * @from: Source address, in kernel space.
589 * @n: Number of bytes to copy. 595 * @n: Number of bytes to copy.
590 * 596 *
591 * Context: User context only. This function may sleep. 597 * Context: User context only. This function may sleep if pagefaults are
598 * enabled.
592 * 599 *
593 * Copy data from kernel space to user space. 600 * Copy data from kernel space to user space.
594 * 601 *
@@ -606,7 +613,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
606 * @from: Source address, in user space. 613 * @from: Source address, in user space.
607 * @n: Number of bytes to copy. 614 * @n: Number of bytes to copy.
608 * 615 *
609 * Context: User context only. This function may sleep. 616 * Context: User context only. This function may sleep if pagefaults are
617 * enabled.
610 * 618 *
611 * Copy data from user space to kernel space. Caller must check 619 * Copy data from user space to kernel space. Caller must check
612 * the specified block with access_ok() before calling this function. 620 * the specified block with access_ok() before calling this function.
@@ -626,7 +634,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
626 * @from: Source address, in user space. 634 * @from: Source address, in user space.
627 * @n: Number of bytes to copy. 635 * @n: Number of bytes to copy.
628 * 636 *
629 * Context: User context only. This function may sleep. 637 * Context: User context only. This function may sleep if pagefaults are
638 * enabled.
630 * 639 *
631 * Copy data from user space to kernel space. 640 * Copy data from user space to kernel space.
632 * 641 *
@@ -677,7 +686,8 @@ unsigned long clear_user(void __user *mem, unsigned long len);
677 * strlen_user: - Get the size of a string in user space. 686 * strlen_user: - Get the size of a string in user space.
678 * @str: The string to measure. 687 * @str: The string to measure.
679 * 688 *
680 * Context: User context only. This function may sleep. 689 * Context: User context only. This function may sleep if pagefaults are
690 * enabled.
681 * 691 *
682 * Get the size of a NUL-terminated string in user space. 692 * Get the size of a NUL-terminated string in user space.
683 * 693 *
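Each kerneldoc hunk above tightens the same sentence: these uaccess primitives may sleep to resolve a page fault only while pagefaults are enabled. Combined with the access_ok() requirement the comments already state, a minimal caller of the documented contract looks like this (a sketch; the buffer and function names are illustrative, not from the patch):

#include <linux/errno.h>
#include <linux/uaccess.h>

/* User context, pagefaults enabled: the copy may fault and sleep. */
static int read_user_buf(void *dst, const void __user *src, unsigned long len)
{
	/* The __ variant skips the range check, so the caller must do it. */
	if (!access_ok(VERIFY_READ, src, len))
		return -EFAULT;

	/* Returns the number of bytes it could not copy; 0 means success. */
	return __copy_from_user(dst, src, len) ? -EFAULT : 0;
}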
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index e3d4d4890104..8f9875b7933d 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -24,9 +24,9 @@
24#include <linux/vt_kern.h> /* For unblank_screen() */ 24#include <linux/vt_kern.h> /* For unblank_screen() */
25#include <linux/highmem.h> 25#include <linux/highmem.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/uaccess.h>
27 28
28#include <asm/m32r.h> 29#include <asm/m32r.h>
29#include <asm/uaccess.h>
30#include <asm/hardirq.h> 30#include <asm/hardirq.h>
31#include <asm/mmu_context.h> 31#include <asm/mmu_context.h>
32#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
@@ -111,10 +111,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
111 mm = tsk->mm; 111 mm = tsk->mm;
112 112
113 /* 113 /*
114 * If we're in an interrupt or have no user context or are running in an 114 * If we're in an interrupt or have no user context or have pagefaults
115 * atomic region then we must not take the fault.. 115 * disabled then we must not take the fault.
116 */ 116 */
117 if (in_atomic() || !mm) 117 if (faulthandler_disabled() || !mm)
118 goto bad_area_nosemaphore; 118 goto bad_area_nosemaphore;
119 119
120 if (error_code & ACE_USERMODE) 120 if (error_code & ACE_USERMODE)
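The fault-handler hunks throughout this series follow the m32r change above: the old in_atomic() test becomes faulthandler_disabled(), which also covers pagefault_disable() now that disabling pagefaults no longer implies disabling preemption. A rough paraphrase of the check the converted handlers rely on (a sketch, not the verbatim <linux/uaccess.h> implementation):

#include <linux/preempt.h>	/* in_atomic() */
#include <linux/sched.h>	/* current */
#include <linux/uaccess.h>	/* pagefault_disabled() */

/* True in irq/atomic context or inside a pagefault_disable()d region. */
static inline bool faulthandler_disabled_sketch(void)
{
	return pagefault_disabled() || in_atomic();
}

/* The guard every converted do_page_fault() now opens with: if the
 * handler is disabled or there is no user mm, never try to take
 * mmap_sem; go straight to the no_context / nosemaphore path. */
static bool must_skip_user_fault(void)
{
	return faulthandler_disabled_sketch() || !current->mm;
}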
diff --git a/arch/m68k/include/asm/irqflags.h b/arch/m68k/include/asm/irqflags.h
index a823cd73dc09..b5941818346f 100644
--- a/arch/m68k/include/asm/irqflags.h
+++ b/arch/m68k/include/asm/irqflags.h
@@ -2,9 +2,6 @@
2#define _M68K_IRQFLAGS_H 2#define _M68K_IRQFLAGS_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#ifdef CONFIG_MMU
6#include <linux/preempt_mask.h>
7#endif
8#include <linux/preempt.h> 5#include <linux/preempt.h>
9#include <asm/thread_info.h> 6#include <asm/thread_info.h>
10#include <asm/entry.h> 7#include <asm/entry.h>
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index b2f04aee46ec..6a94cdd0c830 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -10,10 +10,10 @@
10#include <linux/ptrace.h> 10#include <linux/ptrace.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/uaccess.h>
13 14
14#include <asm/setup.h> 15#include <asm/setup.h>
15#include <asm/traps.h> 16#include <asm/traps.h>
16#include <asm/uaccess.h>
17#include <asm/pgalloc.h> 17#include <asm/pgalloc.h>
18 18
19extern void die_if_kernel(char *, struct pt_regs *, long); 19extern void die_if_kernel(char *, struct pt_regs *, long);
@@ -81,7 +81,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
81 * If we're in an interrupt or have no user 81 * If we're in an interrupt or have no user
82 * context, we must not take the fault.. 82 * context, we must not take the fault..
83 */ 83 */
84 if (in_atomic() || !mm) 84 if (faulthandler_disabled() || !mm)
85 goto no_context; 85 goto no_context;
86 86
87 if (user_mode(regs)) 87 if (user_mode(regs))
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index 2de5dc695a87..f57edca63609 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -105,7 +105,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
105 105
106 mm = tsk->mm; 106 mm = tsk->mm;
107 107
108 if (in_atomic() || !mm) 108 if (faulthandler_disabled() || !mm)
109 goto no_context; 109 goto no_context;
110 110
111 if (user_mode(regs)) 111 if (user_mode(regs))
diff --git a/arch/metag/mm/highmem.c b/arch/metag/mm/highmem.c
index d71f621a2c0b..807f1b1c4e65 100644
--- a/arch/metag/mm/highmem.c
+++ b/arch/metag/mm/highmem.c
@@ -43,7 +43,7 @@ void *kmap_atomic(struct page *page)
43 unsigned long vaddr; 43 unsigned long vaddr;
44 int type; 44 int type;
45 45
46 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 46 preempt_disable();
47 pagefault_disable(); 47 pagefault_disable();
48 if (!PageHighMem(page)) 48 if (!PageHighMem(page))
49 return page_address(page); 49 return page_address(page);
@@ -82,6 +82,7 @@ void __kunmap_atomic(void *kvaddr)
82 } 82 }
83 83
84 pagefault_enable(); 84 pagefault_enable();
85 preempt_enable();
85} 86}
86EXPORT_SYMBOL(__kunmap_atomic); 87EXPORT_SYMBOL(__kunmap_atomic);
87 88
@@ -95,6 +96,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
95 unsigned long vaddr; 96 unsigned long vaddr;
96 int type; 97 int type;
97 98
99 preempt_disable();
98 pagefault_disable(); 100 pagefault_disable();
99 101
100 type = kmap_atomic_idx_push(); 102 type = kmap_atomic_idx_push();
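The highmem hunks (metag above; microblaze, MIPS, mn10300, parisc, powerpc, sparc and tile below) all make the same substitution: the old comment about in_atomic() in do_page_fault and the implicit reliance on pagefault_disable() give way to an explicit preempt_disable()/pagefault_disable() pair, released in reverse order on unmap. Stripped of the per-arch fixmap handling, the shape is (a skeleton only; the elided pte handling is unchanged):

#include <linux/highmem.h>
#include <linux/preempt.h>
#include <linux/uaccess.h>	/* pagefault_disable()/pagefault_enable() */

void *kmap_atomic_sketch(struct page *page)
{
	preempt_disable();	/* previously implied by pagefault_disable() */
	pagefault_disable();
	if (!PageHighMem(page))
		return page_address(page);

	/* ... kmap_atomic_idx_push(), install the fixmap pte ... */
	return NULL;		/* placeholder for the mapped vaddr */
}

void kunmap_atomic_sketch(void *kvaddr)
{
	/* ... tear down the fixmap pte, kmap_atomic_idx_pop() ... */
	(void)kvaddr;
	pagefault_enable();	/* reverse order of the map side */
	preempt_enable();
}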
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 62942fd12672..331b0d35f89c 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -178,7 +178,8 @@ extern long __user_bad(void);
178 * @x: Variable to store result. 178 * @x: Variable to store result.
179 * @ptr: Source address, in user space. 179 * @ptr: Source address, in user space.
180 * 180 *
181 * Context: User context only. This function may sleep. 181 * Context: User context only. This function may sleep if pagefaults are
182 * enabled.
182 * 183 *
183 * This macro copies a single simple variable from user space to kernel 184 * This macro copies a single simple variable from user space to kernel
184 * space. It supports simple types like char and int, but not larger 185 * space. It supports simple types like char and int, but not larger
@@ -290,7 +291,8 @@ extern long __user_bad(void);
290 * @x: Value to copy to user space. 291 * @x: Value to copy to user space.
291 * @ptr: Destination address, in user space. 292 * @ptr: Destination address, in user space.
292 * 293 *
293 * Context: User context only. This function may sleep. 294 * Context: User context only. This function may sleep if pagefaults are
295 * enabled.
294 * 296 *
295 * This macro copies a single simple value from kernel space to user 297 * This macro copies a single simple value from kernel space to user
296 * space. It supports simple types like char and int, but not larger 298 * space. It supports simple types like char and int, but not larger
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index d46a5ebb7570..177dfc003643 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -107,14 +107,14 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
107 if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11) 107 if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11)
108 is_write = 0; 108 is_write = 0;
109 109
110 if (unlikely(in_atomic() || !mm)) { 110 if (unlikely(faulthandler_disabled() || !mm)) {
111 if (kernel_mode(regs)) 111 if (kernel_mode(regs))
112 goto bad_area_nosemaphore; 112 goto bad_area_nosemaphore;
113 113
114 /* in_atomic() in user mode is really bad, 114 /* faulthandler_disabled() in user mode is really bad,
115 as is current->mm == NULL. */ 115 as is current->mm == NULL. */
116 pr_emerg("Page fault in user mode with in_atomic(), mm = %p\n", 116 pr_emerg("Page fault in user mode with faulthandler_disabled(), mm = %p\n",
117 mm); 117 mm);
118 pr_emerg("r15 = %lx MSR = %lx\n", 118 pr_emerg("r15 = %lx MSR = %lx\n",
119 regs->r15, regs->msr); 119 regs->r15, regs->msr);
120 die("Weird page fault", regs, SIGSEGV); 120 die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c
index 5a92576fad92..2fcc5a52d84d 100644
--- a/arch/microblaze/mm/highmem.c
+++ b/arch/microblaze/mm/highmem.c
@@ -37,7 +37,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
37 unsigned long vaddr; 37 unsigned long vaddr;
38 int idx, type; 38 int idx, type;
39 39
40 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 40 preempt_disable();
41 pagefault_disable(); 41 pagefault_disable();
42 if (!PageHighMem(page)) 42 if (!PageHighMem(page))
43 return page_address(page); 43 return page_address(page);
@@ -63,6 +63,7 @@ void __kunmap_atomic(void *kvaddr)
63 63
64 if (vaddr < __fix_to_virt(FIX_KMAP_END)) { 64 if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
65 pagefault_enable(); 65 pagefault_enable();
66 preempt_enable();
66 return; 67 return;
67 } 68 }
68 69
@@ -84,5 +85,6 @@ void __kunmap_atomic(void *kvaddr)
84#endif 85#endif
85 kmap_atomic_idx_pop(); 86 kmap_atomic_idx_pop();
86 pagefault_enable(); 87 pagefault_enable();
88 preempt_enable();
87} 89}
88EXPORT_SYMBOL(__kunmap_atomic); 90EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h
index 3e307ec2afba..7afda4150a59 100644
--- a/arch/mips/include/asm/topology.h
+++ b/arch/mips/include/asm/topology.h
@@ -15,7 +15,7 @@
15#define topology_physical_package_id(cpu) (cpu_data[cpu].package) 15#define topology_physical_package_id(cpu) (cpu_data[cpu].package)
16#define topology_core_id(cpu) (cpu_data[cpu].core) 16#define topology_core_id(cpu) (cpu_data[cpu].core)
17#define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) 17#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
18#define topology_thread_cpumask(cpu) (&cpu_sibling_map[cpu]) 18#define topology_sibling_cpumask(cpu) (&cpu_sibling_map[cpu])
19#endif 19#endif
20 20
21#endif /* __ASM_TOPOLOGY_H */ 21#endif /* __ASM_TOPOLOGY_H */
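The topology hunks (MIPS above; powerpc, s390, sparc and tile below) are a pure rename of topology_thread_cpumask() to topology_sibling_cpumask(); the powerpc tlb_nohash.c hunk further down shows a real caller. A minimal illustrative use of the renamed accessor (the helper itself is not part of the patch):

#include <linux/cpumask.h>
#include <linux/topology.h>	/* topology_sibling_cpumask() after the rename */

/* Count the SMT siblings sharing a core with @cpu, @cpu included. */
static unsigned int count_smt_siblings(unsigned int cpu)
{
	return cpumask_weight(topology_sibling_cpumask(cpu));
}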
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index bf8b32450ef6..9722357d2854 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -103,7 +103,8 @@ extern u64 __ua_limit;
103 * @addr: User space pointer to start of block to check 103 * @addr: User space pointer to start of block to check
104 * @size: Size of block to check 104 * @size: Size of block to check
105 * 105 *
106 * Context: User context only. This function may sleep. 106 * Context: User context only. This function may sleep if pagefaults are
107 * enabled.
107 * 108 *
108 * Checks if a pointer to a block of memory in user space is valid. 109 * Checks if a pointer to a block of memory in user space is valid.
109 * 110 *
@@ -138,7 +139,8 @@ extern u64 __ua_limit;
138 * @x: Value to copy to user space. 139 * @x: Value to copy to user space.
139 * @ptr: Destination address, in user space. 140 * @ptr: Destination address, in user space.
140 * 141 *
141 * Context: User context only. This function may sleep. 142 * Context: User context only. This function may sleep if pagefaults are
143 * enabled.
142 * 144 *
143 * This macro copies a single simple value from kernel space to user 145 * This macro copies a single simple value from kernel space to user
144 * space. It supports simple types like char and int, but not larger 146 * space. It supports simple types like char and int, but not larger
@@ -157,7 +159,8 @@ extern u64 __ua_limit;
157 * @x: Variable to store result. 159 * @x: Variable to store result.
158 * @ptr: Source address, in user space. 160 * @ptr: Source address, in user space.
159 * 161 *
160 * Context: User context only. This function may sleep. 162 * Context: User context only. This function may sleep if pagefaults are
163 * enabled.
161 * 164 *
162 * This macro copies a single simple variable from user space to kernel 165 * This macro copies a single simple variable from user space to kernel
163 * space. It supports simple types like char and int, but not larger 166 * space. It supports simple types like char and int, but not larger
@@ -177,7 +180,8 @@ extern u64 __ua_limit;
177 * @x: Value to copy to user space. 180 * @x: Value to copy to user space.
178 * @ptr: Destination address, in user space. 181 * @ptr: Destination address, in user space.
179 * 182 *
180 * Context: User context only. This function may sleep. 183 * Context: User context only. This function may sleep if pagefaults are
184 * enabled.
181 * 185 *
182 * This macro copies a single simple value from kernel space to user 186 * This macro copies a single simple value from kernel space to user
183 * space. It supports simple types like char and int, but not larger 187 * space. It supports simple types like char and int, but not larger
@@ -199,7 +203,8 @@ extern u64 __ua_limit;
199 * @x: Variable to store result. 203 * @x: Variable to store result.
200 * @ptr: Source address, in user space. 204 * @ptr: Source address, in user space.
201 * 205 *
202 * Context: User context only. This function may sleep. 206 * Context: User context only. This function may sleep if pagefaults are
207 * enabled.
203 * 208 *
204 * This macro copies a single simple variable from user space to kernel 209 * This macro copies a single simple variable from user space to kernel
205 * space. It supports simple types like char and int, but not larger 210 * space. It supports simple types like char and int, but not larger
@@ -498,7 +503,8 @@ extern void __put_user_unknown(void);
498 * @x: Value to copy to user space. 503 * @x: Value to copy to user space.
499 * @ptr: Destination address, in user space. 504 * @ptr: Destination address, in user space.
500 * 505 *
501 * Context: User context only. This function may sleep. 506 * Context: User context only. This function may sleep if pagefaults are
507 * enabled.
502 * 508 *
503 * This macro copies a single simple value from kernel space to user 509 * This macro copies a single simple value from kernel space to user
504 * space. It supports simple types like char and int, but not larger 510 * space. It supports simple types like char and int, but not larger
@@ -517,7 +523,8 @@ extern void __put_user_unknown(void);
517 * @x: Variable to store result. 523 * @x: Variable to store result.
518 * @ptr: Source address, in user space. 524 * @ptr: Source address, in user space.
519 * 525 *
520 * Context: User context only. This function may sleep. 526 * Context: User context only. This function may sleep if pagefaults are
527 * enabled.
521 * 528 *
522 * This macro copies a single simple variable from user space to kernel 529 * This macro copies a single simple variable from user space to kernel
523 * space. It supports simple types like char and int, but not larger 530 * space. It supports simple types like char and int, but not larger
@@ -537,7 +544,8 @@ extern void __put_user_unknown(void);
537 * @x: Value to copy to user space. 544 * @x: Value to copy to user space.
538 * @ptr: Destination address, in user space. 545 * @ptr: Destination address, in user space.
539 * 546 *
540 * Context: User context only. This function may sleep. 547 * Context: User context only. This function may sleep if pagefaults are
548 * enabled.
541 * 549 *
542 * This macro copies a single simple value from kernel space to user 550 * This macro copies a single simple value from kernel space to user
543 * space. It supports simple types like char and int, but not larger 551 * space. It supports simple types like char and int, but not larger
@@ -559,7 +567,8 @@ extern void __put_user_unknown(void);
559 * @x: Variable to store result. 567 * @x: Variable to store result.
560 * @ptr: Source address, in user space. 568 * @ptr: Source address, in user space.
561 * 569 *
562 * Context: User context only. This function may sleep. 570 * Context: User context only. This function may sleep if pagefaults are
571 * enabled.
563 * 572 *
564 * This macro copies a single simple variable from user space to kernel 573 * This macro copies a single simple variable from user space to kernel
565 * space. It supports simple types like char and int, but not larger 574 * space. It supports simple types like char and int, but not larger
@@ -815,7 +824,8 @@ extern size_t __copy_user(void *__to, const void *__from, size_t __n);
815 * @from: Source address, in kernel space. 824 * @from: Source address, in kernel space.
816 * @n: Number of bytes to copy. 825 * @n: Number of bytes to copy.
817 * 826 *
818 * Context: User context only. This function may sleep. 827 * Context: User context only. This function may sleep if pagefaults are
828 * enabled.
819 * 829 *
820 * Copy data from kernel space to user space. Caller must check 830 * Copy data from kernel space to user space. Caller must check
821 * the specified block with access_ok() before calling this function. 831 * the specified block with access_ok() before calling this function.
@@ -888,7 +898,8 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
888 * @from: Source address, in kernel space. 898 * @from: Source address, in kernel space.
889 * @n: Number of bytes to copy. 899 * @n: Number of bytes to copy.
890 * 900 *
891 * Context: User context only. This function may sleep. 901 * Context: User context only. This function may sleep if pagefaults are
902 * enabled.
892 * 903 *
893 * Copy data from kernel space to user space. 904 * Copy data from kernel space to user space.
894 * 905 *
@@ -1075,7 +1086,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
1075 * @from: Source address, in user space. 1086 * @from: Source address, in user space.
1076 * @n: Number of bytes to copy. 1087 * @n: Number of bytes to copy.
1077 * 1088 *
1078 * Context: User context only. This function may sleep. 1089 * Context: User context only. This function may sleep if pagefaults are
1090 * enabled.
1079 * 1091 *
1080 * Copy data from user space to kernel space. Caller must check 1092 * Copy data from user space to kernel space. Caller must check
1081 * the specified block with access_ok() before calling this function. 1093 * the specified block with access_ok() before calling this function.
@@ -1107,7 +1119,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
1107 * @from: Source address, in user space. 1119 * @from: Source address, in user space.
1108 * @n: Number of bytes to copy. 1120 * @n: Number of bytes to copy.
1109 * 1121 *
1110 * Context: User context only. This function may sleep. 1122 * Context: User context only. This function may sleep if pagefaults are
1123 * enabled.
1111 * 1124 *
1112 * Copy data from user space to kernel space. 1125 * Copy data from user space to kernel space.
1113 * 1126 *
@@ -1329,7 +1342,8 @@ strncpy_from_user(char *__to, const char __user *__from, long __len)
1329 * strlen_user: - Get the size of a string in user space. 1342 * strlen_user: - Get the size of a string in user space.
1330 * @str: The string to measure. 1343 * @str: The string to measure.
1331 * 1344 *
1332 * Context: User context only. This function may sleep. 1345 * Context: User context only. This function may sleep if pagefaults are
1346 * enabled.
1333 * 1347 *
1334 * Get the size of a NUL-terminated string in user space. 1348 * Get the size of a NUL-terminated string in user space.
1335 * 1349 *
@@ -1398,7 +1412,8 @@ static inline long __strnlen_user(const char __user *s, long n)
1398 * strnlen_user: - Get the size of a string in user space. 1412 * strnlen_user: - Get the size of a string in user space.
1399 * @str: The string to measure. 1413 * @str: The string to measure.
1400 * 1414 *
1401 * Context: User context only. This function may sleep. 1415 * Context: User context only. This function may sleep if pagefaults are
1416 * enabled.
1402 * 1417 *
1403 * Get the size of a NUL-terminated string in user space. 1418 * Get the size of a NUL-terminated string in user space.
1404 * 1419 *
diff --git a/arch/mips/kernel/signal-common.h b/arch/mips/kernel/signal-common.h
index 06805e09bcd3..0b85f827cd18 100644
--- a/arch/mips/kernel/signal-common.h
+++ b/arch/mips/kernel/signal-common.h
@@ -28,12 +28,7 @@ extern void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
28extern int fpcsr_pending(unsigned int __user *fpcsr); 28extern int fpcsr_pending(unsigned int __user *fpcsr);
29 29
30/* Make sure we will not lose FPU ownership */ 30/* Make sure we will not lose FPU ownership */
31#ifdef CONFIG_PREEMPT 31#define lock_fpu_owner() ({ preempt_disable(); pagefault_disable(); })
32#define lock_fpu_owner() preempt_disable() 32#define unlock_fpu_owner() ({ pagefault_enable(); preempt_enable(); })
33#define unlock_fpu_owner() preempt_enable()
34#else
35#define lock_fpu_owner() pagefault_disable()
36#define unlock_fpu_owner() pagefault_enable()
37#endif
38 33
39#endif /* __SIGNAL_COMMON_H */ 34#endif /* __SIGNAL_COMMON_H */
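Since pagefault_disable() no longer disables preemption, the MIPS signal code above can no longer pick one primitive or the other based on CONFIG_PREEMPT; it now always takes both, in that order, and releases them in reverse. A hedged usage sketch (copy_fpu_regs_to_user() is a hypothetical stand-in for the real signal-frame helpers):

int copy_fpu_regs_to_user(void __user *frame);	/* hypothetical helper */

/* Keep FPU ownership stable while its registers are copied out: no
 * preemption (another task could claim the FPU) and no sleeping page
 * faults (the destination is user memory). */
static int save_fpu_state_sketch(void __user *frame)
{
	int err;

	lock_fpu_owner();	/* preempt_disable(); pagefault_disable(); */
	err = copy_fpu_regs_to_user(frame);
	unlock_fpu_owner();	/* pagefault_enable(); preempt_enable(); */

	return err;
}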
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 7ff8637e530d..36c0f26fac6b 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -21,10 +21,10 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/kprobes.h> 22#include <linux/kprobes.h>
23#include <linux/perf_event.h> 23#include <linux/perf_event.h>
24#include <linux/uaccess.h>
24 25
25#include <asm/branch.h> 26#include <asm/branch.h>
26#include <asm/mmu_context.h> 27#include <asm/mmu_context.h>
27#include <asm/uaccess.h>
28#include <asm/ptrace.h> 28#include <asm/ptrace.h>
29#include <asm/highmem.h> /* For VMALLOC_END */ 29#include <asm/highmem.h> /* For VMALLOC_END */
30#include <linux/kdebug.h> 30#include <linux/kdebug.h>
@@ -94,7 +94,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
94 * If we're in an interrupt or have no user 94 * If we're in an interrupt or have no user
95 * context, we must not take the fault.. 95 * context, we must not take the fault..
96 */ 96 */
97 if (in_atomic() || !mm) 97 if (faulthandler_disabled() || !mm)
98 goto bad_area_nosemaphore; 98 goto bad_area_nosemaphore;
99 99
100 if (user_mode(regs)) 100 if (user_mode(regs))
diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c
index da815d295239..11661cbc11a8 100644
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -47,7 +47,7 @@ void *kmap_atomic(struct page *page)
47 unsigned long vaddr; 47 unsigned long vaddr;
48 int idx, type; 48 int idx, type;
49 49
50 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 50 preempt_disable();
51 pagefault_disable(); 51 pagefault_disable();
52 if (!PageHighMem(page)) 52 if (!PageHighMem(page))
53 return page_address(page); 53 return page_address(page);
@@ -72,6 +72,7 @@ void __kunmap_atomic(void *kvaddr)
72 72
73 if (vaddr < FIXADDR_START) { // FIXME 73 if (vaddr < FIXADDR_START) { // FIXME
74 pagefault_enable(); 74 pagefault_enable();
75 preempt_enable();
75 return; 76 return;
76 } 77 }
77 78
@@ -92,6 +93,7 @@ void __kunmap_atomic(void *kvaddr)
92#endif 93#endif
93 kmap_atomic_idx_pop(); 94 kmap_atomic_idx_pop();
94 pagefault_enable(); 95 pagefault_enable();
96 preempt_enable();
95} 97}
96EXPORT_SYMBOL(__kunmap_atomic); 98EXPORT_SYMBOL(__kunmap_atomic);
97 99
@@ -104,6 +106,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
104 unsigned long vaddr; 106 unsigned long vaddr;
105 int idx, type; 107 int idx, type;
106 108
109 preempt_disable();
107 pagefault_disable(); 110 pagefault_disable();
108 111
109 type = kmap_atomic_idx_push(); 112 type = kmap_atomic_idx_push();
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index faa5c9822ecc..198a3147dd7d 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -90,6 +90,7 @@ static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot)
90 90
91 BUG_ON(Page_dcache_dirty(page)); 91 BUG_ON(Page_dcache_dirty(page));
92 92
93 preempt_disable();
93 pagefault_disable(); 94 pagefault_disable();
94 idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1); 95 idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
95 idx += in_interrupt() ? FIX_N_COLOURS : 0; 96 idx += in_interrupt() ? FIX_N_COLOURS : 0;
@@ -152,6 +153,7 @@ void kunmap_coherent(void)
152 write_c0_entryhi(old_ctx); 153 write_c0_entryhi(old_ctx);
153 local_irq_restore(flags); 154 local_irq_restore(flags);
154 pagefault_enable(); 155 pagefault_enable();
156 preempt_enable();
155} 157}
156 158
157void copy_user_highpage(struct page *to, struct page *from, 159void copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h
index 2fbbe4d920aa..1ddea5afba09 100644
--- a/arch/mn10300/include/asm/highmem.h
+++ b/arch/mn10300/include/asm/highmem.h
@@ -75,6 +75,7 @@ static inline void *kmap_atomic(struct page *page)
75 unsigned long vaddr; 75 unsigned long vaddr;
76 int idx, type; 76 int idx, type;
77 77
78 preempt_disable();
78 pagefault_disable(); 79 pagefault_disable();
79 if (page < highmem_start_page) 80 if (page < highmem_start_page)
80 return page_address(page); 81 return page_address(page);
@@ -98,6 +99,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
98 99
99 if (vaddr < FIXADDR_START) { /* FIXME */ 100 if (vaddr < FIXADDR_START) { /* FIXME */
100 pagefault_enable(); 101 pagefault_enable();
102 preempt_enable();
101 return; 103 return;
102 } 104 }
103 105
@@ -122,6 +124,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
122 124
123 kmap_atomic_idx_pop(); 125 kmap_atomic_idx_pop();
124 pagefault_enable(); 126 pagefault_enable();
127 preempt_enable();
125} 128}
126#endif /* __KERNEL__ */ 129#endif /* __KERNEL__ */
127 130
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index 0c2cc5d39c8e..4a1d181ed32f 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -23,8 +23,8 @@
23#include <linux/interrupt.h> 23#include <linux/interrupt.h>
24#include <linux/init.h> 24#include <linux/init.h>
25#include <linux/vt_kern.h> /* For unblank_screen() */ 25#include <linux/vt_kern.h> /* For unblank_screen() */
26#include <linux/uaccess.h>
26 27
27#include <asm/uaccess.h>
28#include <asm/pgalloc.h> 28#include <asm/pgalloc.h>
29#include <asm/hardirq.h> 29#include <asm/hardirq.h>
30#include <asm/cpu-regs.h> 30#include <asm/cpu-regs.h>
@@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
168 * If we're in an interrupt or have no user 168 * If we're in an interrupt or have no user
169 * context, we must not take the fault.. 169 * context, we must not take the fault..
170 */ 170 */
171 if (in_atomic() || !mm) 171 if (faulthandler_disabled() || !mm)
172 goto no_context; 172 goto no_context;
173 173
174 if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) 174 if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index 0c9b6afe69e9..b51878b0c6b8 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -77,7 +77,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
77 * If we're in an interrupt or have no user 77 * If we're in an interrupt or have no user
78 * context, we must not take the fault.. 78 * context, we must not take the fault..
79 */ 79 */
80 if (in_atomic() || !mm) 80 if (faulthandler_disabled() || !mm)
81 goto bad_area_nosemaphore; 81 goto bad_area_nosemaphore;
82 82
83 if (user_mode(regs)) 83 if (user_mode(regs))
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index de65f66ea64e..ec2df4bab302 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -142,6 +142,7 @@ static inline void kunmap(struct page *page)
142 142
143static inline void *kmap_atomic(struct page *page) 143static inline void *kmap_atomic(struct page *page)
144{ 144{
145 preempt_disable();
145 pagefault_disable(); 146 pagefault_disable();
146 return page_address(page); 147 return page_address(page);
147} 148}
@@ -150,6 +151,7 @@ static inline void __kunmap_atomic(void *addr)
150{ 151{
151 flush_kernel_dcache_page_addr(addr); 152 flush_kernel_dcache_page_addr(addr);
152 pagefault_enable(); 153 pagefault_enable();
154 preempt_enable();
153} 155}
154 156
155#define kmap_atomic_prot(page, prot) kmap_atomic(page) 157#define kmap_atomic_prot(page, prot) kmap_atomic(page)
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 47ee620d15d2..6548fd1d2e62 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -26,9 +26,9 @@
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/bug.h> 27#include <linux/bug.h>
28#include <linux/ratelimit.h> 28#include <linux/ratelimit.h>
29#include <linux/uaccess.h>
29 30
30#include <asm/assembly.h> 31#include <asm/assembly.h>
31#include <asm/uaccess.h>
32#include <asm/io.h> 32#include <asm/io.h>
33#include <asm/irq.h> 33#include <asm/irq.h>
34#include <asm/traps.h> 34#include <asm/traps.h>
@@ -800,7 +800,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
800 * unless pagefault_disable() was called before. 800 * unless pagefault_disable() was called before.
801 */ 801 */
802 802
803 if (fault_space == 0 && !in_atomic()) 803 if (fault_space == 0 && !faulthandler_disabled())
804 { 804 {
805 pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC); 805 pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC);
806 parisc_terminate("Kernel Fault", regs, code, fault_address); 806 parisc_terminate("Kernel Fault", regs, code, fault_address);
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index e5120e653240..15503adddf4f 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -15,8 +15,8 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/uaccess.h>
18 19
19#include <asm/uaccess.h>
20#include <asm/traps.h> 20#include <asm/traps.h>
21 21
22/* Various important other fields */ 22/* Various important other fields */
@@ -207,7 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
207 int fault; 207 int fault;
208 unsigned int flags; 208 unsigned int flags;
209 209
210 if (in_atomic()) 210 if (pagefault_disabled())
211 goto no_context; 211 goto no_context;
212 212
213 tsk = current; 213 tsk = current;
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 5f1048eaa5b6..8b3b46b7b0f2 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -87,7 +87,7 @@ static inline int prrn_is_enabled(void)
87#include <asm/smp.h> 87#include <asm/smp.h>
88 88
89#define topology_physical_package_id(cpu) (cpu_to_chip_id(cpu)) 89#define topology_physical_package_id(cpu) (cpu_to_chip_id(cpu))
90#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) 90#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
91#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) 91#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
92#define topology_core_id(cpu) (cpu_to_core_id(cpu)) 92#define topology_core_id(cpu) (cpu_to_core_id(cpu))
93#endif 93#endif
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index 3cf529ceec5b..ac93a3bd2730 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -27,11 +27,11 @@ int enter_vmx_usercopy(void)
27 if (in_interrupt()) 27 if (in_interrupt())
28 return 0; 28 return 0;
29 29
30 /* This acts as preempt_disable() as well and will make 30 preempt_disable();
31 * enable_kernel_altivec(). We need to disable page faults 31 /*
32 * as they can call schedule and thus make us lose the VMX 32 * We need to disable page faults as they can call schedule and
33 * context. So on page faults, we just fail which will cause 33 * thus make us lose the VMX context. So on page faults, we just
34 * a fallback to the normal non-vmx copy. 34 * fail which will cause a fallback to the normal non-vmx copy.
35 */ 35 */
36 pagefault_disable(); 36 pagefault_disable();
37 37
@@ -47,6 +47,7 @@ int enter_vmx_usercopy(void)
47int exit_vmx_usercopy(void) 47int exit_vmx_usercopy(void)
48{ 48{
49 pagefault_enable(); 49 pagefault_enable();
50 preempt_enable();
50 return 0; 51 return 0;
51} 52}
52 53
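The rewritten comment above spells out why enter_vmx_usercopy() now disables preemption and pagefaults separately: a fault inside the VMX copy would schedule and lose the VMX context, so faults are made to fail fast and the caller falls back to the ordinary copy. A sketch of that fail-fast-then-fallback shape, with a generic copy standing in for the VMX routine (the function name is illustrative):

#include <linux/preempt.h>
#include <linux/uaccess.h>

static unsigned long copy_from_user_fastpath(void *dst,
					     const void __user *src,
					     unsigned long len)
{
	unsigned long left;

	preempt_disable();
	pagefault_disable();
	/* No-sleep attempt: returns bytes left uncopied on a fault. */
	left = __copy_from_user_inatomic(dst, src, len);
	pagefault_enable();
	preempt_enable();

	if (left)	/* faulted: retry on the path that may sleep */
		left = copy_from_user(dst, src, len);

	return left;	/* bytes not copied; 0 on success */
}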
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b396868d2aa7..6d535973b200 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -33,13 +33,13 @@
33#include <linux/ratelimit.h> 33#include <linux/ratelimit.h>
34#include <linux/context_tracking.h> 34#include <linux/context_tracking.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/uaccess.h>
36 37
37#include <asm/firmware.h> 38#include <asm/firmware.h>
38#include <asm/page.h> 39#include <asm/page.h>
39#include <asm/pgtable.h> 40#include <asm/pgtable.h>
40#include <asm/mmu.h> 41#include <asm/mmu.h>
41#include <asm/mmu_context.h> 42#include <asm/mmu_context.h>
42#include <asm/uaccess.h>
43#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
44#include <asm/siginfo.h> 44#include <asm/siginfo.h>
45#include <asm/debug.h> 45#include <asm/debug.h>
@@ -272,15 +272,16 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
272 if (!arch_irq_disabled_regs(regs)) 272 if (!arch_irq_disabled_regs(regs))
273 local_irq_enable(); 273 local_irq_enable();
274 274
275 if (in_atomic() || mm == NULL) { 275 if (faulthandler_disabled() || mm == NULL) {
276 if (!user_mode(regs)) { 276 if (!user_mode(regs)) {
277 rc = SIGSEGV; 277 rc = SIGSEGV;
278 goto bail; 278 goto bail;
279 } 279 }
280 /* in_atomic() in user mode is really bad, 280 /* faulthandler_disabled() in user mode is really bad,
281 as is current->mm == NULL. */ 281 as is current->mm == NULL. */
282 printk(KERN_EMERG "Page fault in user mode with " 282 printk(KERN_EMERG "Page fault in user mode with "
283 "in_atomic() = %d mm = %p\n", in_atomic(), mm); 283 "faulthandler_disabled() = %d mm = %p\n",
284 faulthandler_disabled(), mm);
284 printk(KERN_EMERG "NIP = %lx MSR = %lx\n", 285 printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
285 regs->nip, regs->msr); 286 regs->nip, regs->msr);
286 die("Weird page fault", regs, SIGSEGV); 287 die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index e7450bdbe83a..e292c8a60952 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -34,7 +34,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
34 unsigned long vaddr; 34 unsigned long vaddr;
35 int idx, type; 35 int idx, type;
36 36
37 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 37 preempt_disable();
38 pagefault_disable(); 38 pagefault_disable();
39 if (!PageHighMem(page)) 39 if (!PageHighMem(page))
40 return page_address(page); 40 return page_address(page);
@@ -59,6 +59,7 @@ void __kunmap_atomic(void *kvaddr)
59 59
60 if (vaddr < __fix_to_virt(FIX_KMAP_END)) { 60 if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
61 pagefault_enable(); 61 pagefault_enable();
62 preempt_enable();
62 return; 63 return;
63 } 64 }
64 65
@@ -82,5 +83,6 @@ void __kunmap_atomic(void *kvaddr)
82 83
83 kmap_atomic_idx_pop(); 84 kmap_atomic_idx_pop();
84 pagefault_enable(); 85 pagefault_enable();
86 preempt_enable();
85} 87}
86EXPORT_SYMBOL(__kunmap_atomic); 88EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index cbd3d069897f..723a099f6be3 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -217,7 +217,7 @@ static DEFINE_RAW_SPINLOCK(tlbivax_lock);
217static int mm_is_core_local(struct mm_struct *mm) 217static int mm_is_core_local(struct mm_struct *mm)
218{ 218{
219 return cpumask_subset(mm_cpumask(mm), 219 return cpumask_subset(mm_cpumask(mm),
220 topology_thread_cpumask(smp_processor_id())); 220 topology_sibling_cpumask(smp_processor_id()));
221} 221}
222 222
223struct tlb_flush_param { 223struct tlb_flush_param {
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index b1453a2ae1ca..4990f6c66288 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -22,7 +22,8 @@ DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology);
22 22
23#define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id) 23#define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id)
24#define topology_thread_id(cpu) (per_cpu(cpu_topology, cpu).thread_id) 24#define topology_thread_id(cpu) (per_cpu(cpu_topology, cpu).thread_id)
25#define topology_thread_cpumask(cpu) (&per_cpu(cpu_topology, cpu).thread_mask) 25#define topology_sibling_cpumask(cpu) \
26 (&per_cpu(cpu_topology, cpu).thread_mask)
26#define topology_core_id(cpu) (per_cpu(cpu_topology, cpu).core_id) 27#define topology_core_id(cpu) (per_cpu(cpu_topology, cpu).core_id)
27#define topology_core_cpumask(cpu) (&per_cpu(cpu_topology, cpu).core_mask) 28#define topology_core_cpumask(cpu) (&per_cpu(cpu_topology, cpu).core_mask)
28#define topology_book_id(cpu) (per_cpu(cpu_topology, cpu).book_id) 29#define topology_book_id(cpu) (per_cpu(cpu_topology, cpu).book_id)
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index d64a7a62164f..9dd4cc47ddc7 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -98,7 +98,8 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x)
98 * @from: Source address, in user space. 98 * @from: Source address, in user space.
99 * @n: Number of bytes to copy. 99 * @n: Number of bytes to copy.
100 * 100 *
101 * Context: User context only. This function may sleep. 101 * Context: User context only. This function may sleep if pagefaults are
102 * enabled.
102 * 103 *
103 * Copy data from user space to kernel space. Caller must check 104 * Copy data from user space to kernel space. Caller must check
104 * the specified block with access_ok() before calling this function. 105 * the specified block with access_ok() before calling this function.
@@ -118,7 +119,8 @@ unsigned long __must_check __copy_from_user(void *to, const void __user *from,
118 * @from: Source address, in kernel space. 119 * @from: Source address, in kernel space.
119 * @n: Number of bytes to copy. 120 * @n: Number of bytes to copy.
120 * 121 *
121 * Context: User context only. This function may sleep. 122 * Context: User context only. This function may sleep if pagefaults are
123 * enabled.
122 * 124 *
123 * Copy data from kernel space to user space. Caller must check 125 * Copy data from kernel space to user space. Caller must check
124 * the specified block with access_ok() before calling this function. 126 * the specified block with access_ok() before calling this function.
@@ -264,7 +266,8 @@ int __get_user_bad(void) __attribute__((noreturn));
264 * @from: Source address, in kernel space. 266 * @from: Source address, in kernel space.
265 * @n: Number of bytes to copy. 267 * @n: Number of bytes to copy.
266 * 268 *
267 * Context: User context only. This function may sleep. 269 * Context: User context only. This function may sleep if pagefaults are
270 * enabled.
268 * 271 *
269 * Copy data from kernel space to user space. 272 * Copy data from kernel space to user space.
270 * 273 *
@@ -290,7 +293,8 @@ __compiletime_warning("copy_from_user() buffer size is not provably correct")
290 * @from: Source address, in user space. 293 * @from: Source address, in user space.
291 * @n: Number of bytes to copy. 294 * @n: Number of bytes to copy.
292 * 295 *
293 * Context: User context only. This function may sleep. 296 * Context: User context only. This function may sleep if pagefaults are
297 * enabled.
294 * 298 *
295 * Copy data from user space to kernel space. 299 * Copy data from user space to kernel space.
296 * 300 *
@@ -348,7 +352,8 @@ static inline unsigned long strnlen_user(const char __user *src, unsigned long n
348 * strlen_user: - Get the size of a string in user space. 352 * strlen_user: - Get the size of a string in user space.
349 * @str: The string to measure. 353 * @str: The string to measure.
350 * 354 *
351 * Context: User context only. This function may sleep. 355 * Context: User context only. This function may sleep if pagefaults are
356 * enabled.
352 * 357 *
353 * Get the size of a NUL-terminated string in user space. 358 * Get the size of a NUL-terminated string in user space.
354 * 359 *
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 76515bcea2f1..4c8f5d7f9c23 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -399,7 +399,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
399 * user context. 399 * user context.
400 */ 400 */
401 fault = VM_FAULT_BADCONTEXT; 401 fault = VM_FAULT_BADCONTEXT;
402 if (unlikely(!user_space_fault(regs) || in_atomic() || !mm)) 402 if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm))
403 goto out; 403 goto out;
404 404
405 address = trans_exc_code & __FAIL_ADDR_MASK; 405 address = trans_exc_code & __FAIL_ADDR_MASK;
diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h
index ab66ddde777b..20a3591225cc 100644
--- a/arch/score/include/asm/uaccess.h
+++ b/arch/score/include/asm/uaccess.h
@@ -36,7 +36,8 @@
36 * @addr: User space pointer to start of block to check 36 * @addr: User space pointer to start of block to check
37 * @size: Size of block to check 37 * @size: Size of block to check
38 * 38 *
39 * Context: User context only. This function may sleep. 39 * Context: User context only. This function may sleep if pagefaults are
40 * enabled.
40 * 41 *
41 * Checks if a pointer to a block of memory in user space is valid. 42 * Checks if a pointer to a block of memory in user space is valid.
42 * 43 *
@@ -61,7 +62,8 @@
61 * @x: Value to copy to user space. 62 * @x: Value to copy to user space.
62 * @ptr: Destination address, in user space. 63 * @ptr: Destination address, in user space.
63 * 64 *
64 * Context: User context only. This function may sleep. 65 * Context: User context only. This function may sleep if pagefaults are
66 * enabled.
65 * 67 *
66 * This macro copies a single simple value from kernel space to user 68 * This macro copies a single simple value from kernel space to user
67 * space. It supports simple types like char and int, but not larger 69 * space. It supports simple types like char and int, but not larger
@@ -79,7 +81,8 @@
79 * @x: Variable to store result. 81 * @x: Variable to store result.
80 * @ptr: Source address, in user space. 82 * @ptr: Source address, in user space.
81 * 83 *
82 * Context: User context only. This function may sleep. 84 * Context: User context only. This function may sleep if pagefaults are
85 * enabled.
83 * 86 *
84 * This macro copies a single simple variable from user space to kernel 87 * This macro copies a single simple variable from user space to kernel
85 * space. It supports simple types like char and int, but not larger 88 * space. It supports simple types like char and int, but not larger
@@ -98,7 +101,8 @@
98 * @x: Value to copy to user space. 101 * @x: Value to copy to user space.
99 * @ptr: Destination address, in user space. 102 * @ptr: Destination address, in user space.
100 * 103 *
101 * Context: User context only. This function may sleep. 104 * Context: User context only. This function may sleep if pagefaults are
105 * enabled.
102 * 106 *
103 * This macro copies a single simple value from kernel space to user 107 * This macro copies a single simple value from kernel space to user
104 * space. It supports simple types like char and int, but not larger 108 * space. It supports simple types like char and int, but not larger
@@ -119,7 +123,8 @@
119 * @x: Variable to store result. 123 * @x: Variable to store result.
120 * @ptr: Source address, in user space. 124 * @ptr: Source address, in user space.
121 * 125 *
122 * Context: User context only. This function may sleep. 126 * Context: User context only. This function may sleep if pagefaults are
127 * enabled.
123 * 128 *
124 * This macro copies a single simple variable from user space to kernel 129 * This macro copies a single simple variable from user space to kernel
125 * space. It supports simple types like char and int, but not larger 130 * space. It supports simple types like char and int, but not larger
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
index 6860beb2a280..37a6c2e0e969 100644
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -34,6 +34,7 @@
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/types.h> 35#include <linux/types.h>
36#include <linux/ptrace.h> 36#include <linux/ptrace.h>
37#include <linux/uaccess.h>
37 38
38/* 39/*
39 * This routine handles page faults. It determines the address, 40 * This routine handles page faults. It determines the address,
@@ -73,7 +74,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
73 * If we're in an interrupt or have no user 74 * If we're in an interrupt or have no user
74 * context, we must not take the fault.. 75 * context, we must not take the fault..
75 */ 76 */
76 if (in_atomic() || !mm) 77 if (pagefault_disabled() || !mm)
77 goto bad_area_nosemaphore; 78 goto bad_area_nosemaphore;
78 79
79 if (user_mode(regs)) 80 if (user_mode(regs))
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index a58fec9b55e0..79d8276377d1 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -17,6 +17,7 @@
17#include <linux/kprobes.h> 17#include <linux/kprobes.h>
18#include <linux/perf_event.h> 18#include <linux/perf_event.h>
19#include <linux/kdebug.h> 19#include <linux/kdebug.h>
20#include <linux/uaccess.h>
20#include <asm/io_trapped.h> 21#include <asm/io_trapped.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/tlbflush.h> 23#include <asm/tlbflush.h>
@@ -438,9 +439,9 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
438 439
439 /* 440 /*
440 * If we're in an interrupt, have no user context or are running 441 * If we're in an interrupt, have no user context or are running
441 * in an atomic region then we must not take the fault: 442 * with pagefaults disabled then we must not take the fault:
442 */ 443 */
443 if (unlikely(in_atomic() || !mm)) { 444 if (unlikely(faulthandler_disabled() || !mm)) {
444 bad_area_nosemaphore(regs, error_code, address); 445 bad_area_nosemaphore(regs, error_code, address);
445 return; 446 return;
446 } 447 }
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index d1761df5cca6..01d17046225a 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -41,7 +41,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
41#define topology_physical_package_id(cpu) (cpu_data(cpu).proc_id) 41#define topology_physical_package_id(cpu) (cpu_data(cpu).proc_id)
42#define topology_core_id(cpu) (cpu_data(cpu).core_id) 42#define topology_core_id(cpu) (cpu_data(cpu).core_id)
43#define topology_core_cpumask(cpu) (&cpu_core_sib_map[cpu]) 43#define topology_core_cpumask(cpu) (&cpu_core_sib_map[cpu])
44#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) 44#define topology_sibling_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
45#endif /* CONFIG_SMP */ 45#endif /* CONFIG_SMP */
46 46
47extern cpumask_t cpu_core_map[NR_CPUS]; 47extern cpumask_t cpu_core_map[NR_CPUS];
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 70d817154fe8..c399e7b3b035 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -21,6 +21,7 @@
21#include <linux/perf_event.h> 21#include <linux/perf_event.h>
22#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/kdebug.h> 23#include <linux/kdebug.h>
24#include <linux/uaccess.h>
24 25
25#include <asm/page.h> 26#include <asm/page.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -29,7 +30,6 @@
29#include <asm/setup.h> 30#include <asm/setup.h>
30#include <asm/smp.h> 31#include <asm/smp.h>
31#include <asm/traps.h> 32#include <asm/traps.h>
32#include <asm/uaccess.h>
33 33
34#include "mm_32.h" 34#include "mm_32.h"
35 35
@@ -196,7 +196,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
196 * If we're in an interrupt or have no user 196 * If we're in an interrupt or have no user
197 * context, we must not take the fault.. 197 * context, we must not take the fault..
198 */ 198 */
199 if (in_atomic() || !mm) 199 if (pagefault_disabled() || !mm)
200 goto no_context; 200 goto no_context;
201 201
202 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); 202 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 479823249429..e9268ea1a68d 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -22,12 +22,12 @@
22#include <linux/kdebug.h> 22#include <linux/kdebug.h>
23#include <linux/percpu.h> 23#include <linux/percpu.h>
24#include <linux/context_tracking.h> 24#include <linux/context_tracking.h>
25#include <linux/uaccess.h>
25 26
26#include <asm/page.h> 27#include <asm/page.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include <asm/openprom.h> 29#include <asm/openprom.h>
29#include <asm/oplib.h> 30#include <asm/oplib.h>
30#include <asm/uaccess.h>
31#include <asm/asi.h> 31#include <asm/asi.h>
32#include <asm/lsu.h> 32#include <asm/lsu.h>
33#include <asm/sections.h> 33#include <asm/sections.h>
@@ -330,7 +330,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
330 * If we're in an interrupt or have no user 330 * If we're in an interrupt or have no user
331 * context, we must not take the fault.. 331 * context, we must not take the fault..
332 */ 332 */
333 if (in_atomic() || !mm) 333 if (faulthandler_disabled() || !mm)
334 goto intr_or_no_mm; 334 goto intr_or_no_mm;
335 335
336 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); 336 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index 449f864f0cef..a454ec5ff07a 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page)
53 unsigned long vaddr; 53 unsigned long vaddr;
54 long idx, type; 54 long idx, type;
55 55
56 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 56 preempt_disable();
57 pagefault_disable(); 57 pagefault_disable();
58 if (!PageHighMem(page)) 58 if (!PageHighMem(page))
59 return page_address(page); 59 return page_address(page);
@@ -91,6 +91,7 @@ void __kunmap_atomic(void *kvaddr)
91 91
92 if (vaddr < FIXADDR_START) { // FIXME 92 if (vaddr < FIXADDR_START) { // FIXME
93 pagefault_enable(); 93 pagefault_enable();
94 preempt_enable();
94 return; 95 return;
95 } 96 }
96 97
@@ -126,5 +127,6 @@ void __kunmap_atomic(void *kvaddr)
126 127
127 kmap_atomic_idx_pop(); 128 kmap_atomic_idx_pop();
128 pagefault_enable(); 129 pagefault_enable();
130 preempt_enable();
129} 131}
130EXPORT_SYMBOL(__kunmap_atomic); 132EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 559cb744112c..c5d08b89a96c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2738,7 +2738,7 @@ void hugetlb_setup(struct pt_regs *regs)
2738 struct mm_struct *mm = current->mm; 2738 struct mm_struct *mm = current->mm;
2739 struct tsb_config *tp; 2739 struct tsb_config *tp;
2740 2740
2741 if (in_atomic() || !mm) { 2741 if (faulthandler_disabled() || !mm) {
2742 const struct exception_table_entry *entry; 2742 const struct exception_table_entry *entry;
2743 2743
2744 entry = search_exception_tables(regs->tpc); 2744 entry = search_exception_tables(regs->tpc);
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 938311844233..76b0d0ebb244 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -55,7 +55,7 @@ static inline const struct cpumask *cpumask_of_node(int node)
55#define topology_physical_package_id(cpu) ((void)(cpu), 0) 55#define topology_physical_package_id(cpu) ((void)(cpu), 0)
56#define topology_core_id(cpu) (cpu) 56#define topology_core_id(cpu) (cpu)
57#define topology_core_cpumask(cpu) ((void)(cpu), cpu_online_mask) 57#define topology_core_cpumask(cpu) ((void)(cpu), cpu_online_mask)
58#define topology_thread_cpumask(cpu) cpumask_of(cpu) 58#define topology_sibling_cpumask(cpu) cpumask_of(cpu)
59#endif 59#endif
60 60
61#endif /* _ASM_TILE_TOPOLOGY_H */ 61#endif /* _ASM_TILE_TOPOLOGY_H */
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index f41cb53cf645..a33276bf5ca1 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -78,7 +78,8 @@ int __range_ok(unsigned long addr, unsigned long size);
78 * @addr: User space pointer to start of block to check 78 * @addr: User space pointer to start of block to check
79 * @size: Size of block to check 79 * @size: Size of block to check
80 * 80 *
81 * Context: User context only. This function may sleep. 81 * Context: User context only. This function may sleep if pagefaults are
82 * enabled.
82 * 83 *
83 * Checks if a pointer to a block of memory in user space is valid. 84 * Checks if a pointer to a block of memory in user space is valid.
84 * 85 *
@@ -192,7 +193,8 @@ extern int __get_user_bad(void)
192 * @x: Variable to store result. 193 * @x: Variable to store result.
193 * @ptr: Source address, in user space. 194 * @ptr: Source address, in user space.
194 * 195 *
195 * Context: User context only. This function may sleep. 196 * Context: User context only. This function may sleep if pagefaults are
197 * enabled.
196 * 198 *
197 * This macro copies a single simple variable from user space to kernel 199 * This macro copies a single simple variable from user space to kernel
198 * space. It supports simple types like char and int, but not larger 200 * space. It supports simple types like char and int, but not larger
@@ -274,7 +276,8 @@ extern int __put_user_bad(void)
274 * @x: Value to copy to user space. 276 * @x: Value to copy to user space.
275 * @ptr: Destination address, in user space. 277 * @ptr: Destination address, in user space.
276 * 278 *
277 * Context: User context only. This function may sleep. 279 * Context: User context only. This function may sleep if pagefaults are
280 * enabled.
278 * 281 *
279 * This macro copies a single simple value from kernel space to user 282 * This macro copies a single simple value from kernel space to user
280 * space. It supports simple types like char and int, but not larger 283 * space. It supports simple types like char and int, but not larger
@@ -330,7 +333,8 @@ extern int __put_user_bad(void)
330 * @from: Source address, in kernel space. 333 * @from: Source address, in kernel space.
331 * @n: Number of bytes to copy. 334 * @n: Number of bytes to copy.
332 * 335 *
333 * Context: User context only. This function may sleep. 336 * Context: User context only. This function may sleep if pagefaults are
337 * enabled.
334 * 338 *
335 * Copy data from kernel space to user space. Caller must check 339 * Copy data from kernel space to user space. Caller must check
336 * the specified block with access_ok() before calling this function. 340 * the specified block with access_ok() before calling this function.
@@ -366,7 +370,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
366 * @from: Source address, in user space. 370 * @from: Source address, in user space.
367 * @n: Number of bytes to copy. 371 * @n: Number of bytes to copy.
368 * 372 *
369 * Context: User context only. This function may sleep. 373 * Context: User context only. This function may sleep if pagefaults are
374 * enabled.
370 * 375 *
371 * Copy data from user space to kernel space. Caller must check 376 * Copy data from user space to kernel space. Caller must check
372 * the specified block with access_ok() before calling this function. 377 * the specified block with access_ok() before calling this function.
@@ -437,7 +442,8 @@ static inline unsigned long __must_check copy_from_user(void *to,
437 * @from: Source address, in user space. 442 * @from: Source address, in user space.
438 * @n: Number of bytes to copy. 443 * @n: Number of bytes to copy.
439 * 444 *
440 * Context: User context only. This function may sleep. 445 * Context: User context only. This function may sleep if pagefaults are
446 * enabled.
441 * 447 *
442 * Copy data from user space to user space. Caller must check 448 * Copy data from user space to user space. Caller must check
443 * the specified blocks with access_ok() before calling this function. 449 * the specified blocks with access_ok() before calling this function.
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index e83cc999da02..3f4f58d34a92 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -354,9 +354,9 @@ static int handle_page_fault(struct pt_regs *regs,
354 354
355 /* 355 /*
356 * If we're in an interrupt, have no user context or are running in an 356 * If we're in an interrupt, have no user context or are running in an
357 * atomic region then we must not take the fault. 357 * region with pagefaults disabled then we must not take the fault.
358 */ 358 */
359 if (in_atomic() || !mm) { 359 if (pagefault_disabled() || !mm) {
360 vma = NULL; /* happy compiler */ 360 vma = NULL; /* happy compiler */
361 goto bad_area_nosemaphore; 361 goto bad_area_nosemaphore;
362 } 362 }
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 6aa2f2625447..fcd545014e79 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -201,7 +201,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
201 int idx, type; 201 int idx, type;
202 pte_t *pte; 202 pte_t *pte;
203 203
204 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 204 preempt_disable();
205 pagefault_disable(); 205 pagefault_disable();
206 206
207 /* Avoid icache flushes by disallowing atomic executable mappings. */ 207 /* Avoid icache flushes by disallowing atomic executable mappings. */
@@ -259,6 +259,7 @@ void __kunmap_atomic(void *kvaddr)
259 } 259 }
260 260
261 pagefault_enable(); 261 pagefault_enable();
262 preempt_enable();
262} 263}
263EXPORT_SYMBOL(__kunmap_atomic); 264EXPORT_SYMBOL(__kunmap_atomic);
264 265
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 8e4daf44e980..47ff9b7f3e5d 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -7,6 +7,7 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/hardirq.h> 8#include <linux/hardirq.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/uaccess.h>
10#include <asm/current.h> 11#include <asm/current.h>
11#include <asm/pgtable.h> 12#include <asm/pgtable.h>
12#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
@@ -35,10 +36,10 @@ int handle_page_fault(unsigned long address, unsigned long ip,
35 *code_out = SEGV_MAPERR; 36 *code_out = SEGV_MAPERR;
36 37
37 /* 38 /*
38 * If the fault was during atomic operation, don't take the fault, just 39 * If the fault was with pagefaults disabled, don't take the fault, just
39 * fail. 40 * fail.
40 */ 41 */
41 if (in_atomic()) 42 if (faulthandler_disabled())
42 goto out_nosemaphore; 43 goto out_nosemaphore;
43 44
44 if (is_user) 45 if (is_user)
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index 0dc922dba915..afccef5529cc 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -218,7 +218,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
218 * If we're in an interrupt or have no user 218 * If we're in an interrupt or have no user
219 * context, we must not take the fault.. 219 * context, we must not take the fault..
220 */ 220 */
221 if (in_atomic() || !mm) 221 if (faulthandler_disabled() || !mm)
222 goto no_context; 222 goto no_context;
223 223
224 if (user_mode(regs)) 224 if (user_mode(regs))
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 17a8dced12da..222a6a3ca2b5 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -37,16 +37,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
37DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); 37DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
38DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); 38DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
39 39
40static inline struct cpumask *cpu_sibling_mask(int cpu)
41{
42 return per_cpu(cpu_sibling_map, cpu);
43}
44
45static inline struct cpumask *cpu_core_mask(int cpu)
46{
47 return per_cpu(cpu_core_map, cpu);
48}
49
50static inline struct cpumask *cpu_llc_shared_mask(int cpu) 40static inline struct cpumask *cpu_llc_shared_mask(int cpu)
51{ 41{
52 return per_cpu(cpu_llc_shared_map, cpu); 42 return per_cpu(cpu_llc_shared_map, cpu);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 0e8f04f2c26f..5a77593fdace 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -124,7 +124,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
124 124
125#ifdef ENABLE_TOPO_DEFINES 125#ifdef ENABLE_TOPO_DEFINES
126#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) 126#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
127#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) 127#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
128#endif 128#endif
129 129
130static inline void arch_fix_phys_package_id(int num, u32 slot) 130static inline void arch_fix_phys_package_id(int num, u32 slot)
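With the x86-private cpu_sibling_mask()/cpu_core_mask() wrappers removed and topology_thread_cpumask() renamed, callers reach the SMT-sibling and core masks through the generic topology accessors. A minimal sketch of caller code after the rename; the helper names below are invented for illustration and CONFIG_SMP is assumed:

/* Sketch only: hypothetical helpers using the renamed accessors.
 * topology_sibling_cpumask()/topology_core_cpumask() are the generic
 * replacements for the removed x86-only wrappers.
 */
#include <linux/cpumask.h>
#include <linux/topology.h>

static unsigned int count_smt_siblings(int cpu)
{
	/* Hardware threads sharing a core with @cpu, including @cpu itself. */
	return cpumask_weight(topology_sibling_cpumask(cpu));
}

static bool is_first_thread_of_core(int cpu)
{
	/* True for exactly one thread per core, as the booted_cores logic relies on. */
	return cpumask_first(topology_sibling_cpumask(cpu)) == cpu;
}

The same two accessors are what the smpboot.c, cpufreq and driver hunks further down switch to.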
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index ace9dec050b1..a8df874f3e88 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -74,7 +74,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
74 * @addr: User space pointer to start of block to check 74 * @addr: User space pointer to start of block to check
75 * @size: Size of block to check 75 * @size: Size of block to check
76 * 76 *
77 * Context: User context only. This function may sleep. 77 * Context: User context only. This function may sleep if pagefaults are
78 * enabled.
78 * 79 *
79 * Checks if a pointer to a block of memory in user space is valid. 80 * Checks if a pointer to a block of memory in user space is valid.
80 * 81 *
@@ -145,7 +146,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
145 * @x: Variable to store result. 146 * @x: Variable to store result.
146 * @ptr: Source address, in user space. 147 * @ptr: Source address, in user space.
147 * 148 *
148 * Context: User context only. This function may sleep. 149 * Context: User context only. This function may sleep if pagefaults are
150 * enabled.
149 * 151 *
150 * This macro copies a single simple variable from user space to kernel 152 * This macro copies a single simple variable from user space to kernel
151 * space. It supports simple types like char and int, but not larger 153 * space. It supports simple types like char and int, but not larger
@@ -240,7 +242,8 @@ extern void __put_user_8(void);
240 * @x: Value to copy to user space. 242 * @x: Value to copy to user space.
241 * @ptr: Destination address, in user space. 243 * @ptr: Destination address, in user space.
242 * 244 *
243 * Context: User context only. This function may sleep. 245 * Context: User context only. This function may sleep if pagefaults are
246 * enabled.
244 * 247 *
245 * This macro copies a single simple value from kernel space to user 248 * This macro copies a single simple value from kernel space to user
246 * space. It supports simple types like char and int, but not larger 249 * space. It supports simple types like char and int, but not larger
@@ -455,7 +458,8 @@ struct __large_struct { unsigned long buf[100]; };
455 * @x: Variable to store result. 458 * @x: Variable to store result.
456 * @ptr: Source address, in user space. 459 * @ptr: Source address, in user space.
457 * 460 *
458 * Context: User context only. This function may sleep. 461 * Context: User context only. This function may sleep if pagefaults are
462 * enabled.
459 * 463 *
460 * This macro copies a single simple variable from user space to kernel 464 * This macro copies a single simple variable from user space to kernel
461 * space. It supports simple types like char and int, but not larger 465 * space. It supports simple types like char and int, but not larger
@@ -479,7 +483,8 @@ struct __large_struct { unsigned long buf[100]; };
479 * @x: Value to copy to user space. 483 * @x: Value to copy to user space.
480 * @ptr: Destination address, in user space. 484 * @ptr: Destination address, in user space.
481 * 485 *
482 * Context: User context only. This function may sleep. 486 * Context: User context only. This function may sleep if pagefaults are
487 * enabled.
483 * 488 *
484 * This macro copies a single simple value from kernel space to user 489 * This macro copies a single simple value from kernel space to user
485 * space. It supports simple types like char and int, but not larger 490 * space. It supports simple types like char and int, but not larger
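The recurring kerneldoc change ("may sleep if pagefaults are enabled") records that get_user()/put_user()/copy_*_user() can still block while a fault is serviced, but only when the caller has not wrapped them in pagefault_disable(). A sketch of the ordinary sleepable-context pattern; fetch_sample() and struct sample are invented for illustration, and the access_ok(VERIFY_READ, ...) form is assumed to match this kernel vintage:

/* Sketch: conventional use from sleepable process context. */
#include <linux/uaccess.h>
#include <linux/errno.h>

struct sample { int a, b; };

static int fetch_sample(struct sample *dst, const struct sample __user *src)
{
	if (!access_ok(VERIFY_READ, src, sizeof(*src)))
		return -EFAULT;
	/* May sleep here while the fault is serviced: pagefaults are enabled. */
	if (copy_from_user(dst, src, sizeof(*dst)))
		return -EFAULT;
	return 0;
}

From atomic or pagefault-disabled sections, the __copy_*_user_inatomic() variants, which never sleep, remain the appropriate tools.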
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 3c03a5de64d3..7c8ad3451988 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -70,7 +70,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
70 * @from: Source address, in kernel space. 70 * @from: Source address, in kernel space.
71 * @n: Number of bytes to copy. 71 * @n: Number of bytes to copy.
72 * 72 *
73 * Context: User context only. This function may sleep. 73 * Context: User context only. This function may sleep if pagefaults are
74 * enabled.
74 * 75 *
75 * Copy data from kernel space to user space. Caller must check 76 * Copy data from kernel space to user space. Caller must check
76 * the specified block with access_ok() before calling this function. 77 * the specified block with access_ok() before calling this function.
@@ -117,7 +118,8 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
117 * @from: Source address, in user space. 118 * @from: Source address, in user space.
118 * @n: Number of bytes to copy. 119 * @n: Number of bytes to copy.
119 * 120 *
120 * Context: User context only. This function may sleep. 121 * Context: User context only. This function may sleep if pagefaults are
122 * enabled.
121 * 123 *
122 * Copy data from user space to kernel space. Caller must check 124 * Copy data from user space to kernel space. Caller must check
123 * the specified block with access_ok() before calling this function. 125 * the specified block with access_ok() before calling this function.
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3998131d1a68..324817735771 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2621,7 +2621,7 @@ static void intel_pmu_cpu_starting(int cpu)
2621 if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { 2621 if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
2622 void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; 2622 void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
2623 2623
2624 for_each_cpu(i, topology_thread_cpumask(cpu)) { 2624 for_each_cpu(i, topology_sibling_cpumask(cpu)) {
2625 struct intel_shared_regs *pc; 2625 struct intel_shared_regs *pc;
2626 2626
2627 pc = per_cpu(cpu_hw_events, i).shared_regs; 2627 pc = per_cpu(cpu_hw_events, i).shared_regs;
@@ -2641,7 +2641,7 @@ static void intel_pmu_cpu_starting(int cpu)
2641 if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { 2641 if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
2642 int h = x86_pmu.num_counters >> 1; 2642 int h = x86_pmu.num_counters >> 1;
2643 2643
2644 for_each_cpu(i, topology_thread_cpumask(cpu)) { 2644 for_each_cpu(i, topology_sibling_cpumask(cpu)) {
2645 struct intel_excl_cntrs *c; 2645 struct intel_excl_cntrs *c;
2646 2646
2647 c = per_cpu(cpu_hw_events, i).excl_cntrs; 2647 c = per_cpu(cpu_hw_events, i).excl_cntrs;
@@ -3403,7 +3403,7 @@ static __init int fixup_ht_bug(void)
3403 if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) 3403 if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
3404 return 0; 3404 return 0;
3405 3405
3406 w = cpumask_weight(topology_thread_cpumask(cpu)); 3406 w = cpumask_weight(topology_sibling_cpumask(cpu));
3407 if (w > 1) { 3407 if (w > 1) {
3408 pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); 3408 pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
3409 return 0; 3409 return 0;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index e7d8c7608471..18ca99f2798b 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -12,7 +12,8 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
12{ 12{
13#ifdef CONFIG_SMP 13#ifdef CONFIG_SMP
14 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); 14 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
15 seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu))); 15 seq_printf(m, "siblings\t: %d\n",
16 cpumask_weight(topology_core_cpumask(cpu)));
16 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); 17 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
17 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); 18 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
18 seq_printf(m, "apicid\t\t: %d\n", c->apicid); 19 seq_printf(m, "apicid\t\t: %d\n", c->apicid);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6e338e3b1dc0..c648139d68d7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -445,11 +445,10 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
445} 445}
446 446
447/* 447/*
448 * MONITOR/MWAIT with no hints, used for default default C1 state. 448 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
449 * This invokes MWAIT with interrutps enabled and no flags, 449 * with interrupts enabled and no flags, which is backwards compatible with the
450 * which is backwards compatible with the original MWAIT implementation. 450 * original MWAIT implementation.
451 */ 451 */
452
453static void mwait_idle(void) 452static void mwait_idle(void)
454{ 453{
455 if (!current_set_polling_and_test()) { 454 if (!current_set_polling_and_test()) {
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 50e547eac8cd..0e8209619455 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -314,10 +314,10 @@ topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
314 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); 314 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
315} 315}
316 316
317#define link_mask(_m, c1, c2) \ 317#define link_mask(mfunc, c1, c2) \
318do { \ 318do { \
319 cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \ 319 cpumask_set_cpu((c1), mfunc(c2)); \
320 cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ 320 cpumask_set_cpu((c2), mfunc(c1)); \
321} while (0) 321} while (0)
322 322
323static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) 323static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
@@ -398,9 +398,9 @@ void set_cpu_sibling_map(int cpu)
398 cpumask_set_cpu(cpu, cpu_sibling_setup_mask); 398 cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
399 399
400 if (!has_mp) { 400 if (!has_mp) {
401 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 401 cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
402 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); 402 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
403 cpumask_set_cpu(cpu, cpu_core_mask(cpu)); 403 cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
404 c->booted_cores = 1; 404 c->booted_cores = 1;
405 return; 405 return;
406 } 406 }
@@ -409,32 +409,34 @@ void set_cpu_sibling_map(int cpu)
409 o = &cpu_data(i); 409 o = &cpu_data(i);
410 410
411 if ((i == cpu) || (has_smt && match_smt(c, o))) 411 if ((i == cpu) || (has_smt && match_smt(c, o)))
412 link_mask(sibling, cpu, i); 412 link_mask(topology_sibling_cpumask, cpu, i);
413 413
414 if ((i == cpu) || (has_mp && match_llc(c, o))) 414 if ((i == cpu) || (has_mp && match_llc(c, o)))
415 link_mask(llc_shared, cpu, i); 415 link_mask(cpu_llc_shared_mask, cpu, i);
416 416
417 } 417 }
418 418
419 /* 419 /*
420 * This needs a separate iteration over the cpus because we rely on all 420 * This needs a separate iteration over the cpus because we rely on all
421 * cpu_sibling_mask links to be set-up. 421 * topology_sibling_cpumask links to be set-up.
422 */ 422 */
423 for_each_cpu(i, cpu_sibling_setup_mask) { 423 for_each_cpu(i, cpu_sibling_setup_mask) {
424 o = &cpu_data(i); 424 o = &cpu_data(i);
425 425
426 if ((i == cpu) || (has_mp && match_die(c, o))) { 426 if ((i == cpu) || (has_mp && match_die(c, o))) {
427 link_mask(core, cpu, i); 427 link_mask(topology_core_cpumask, cpu, i);
428 428
429 /* 429 /*
430 * Does this new cpu bringup a new core? 430 * Does this new cpu bringup a new core?
431 */ 431 */
432 if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) { 432 if (cpumask_weight(
433 topology_sibling_cpumask(cpu)) == 1) {
433 /* 434 /*
434 * for each core in package, increment 435 * for each core in package, increment
435 * the booted_cores for this new cpu 436 * the booted_cores for this new cpu
436 */ 437 */
437 if (cpumask_first(cpu_sibling_mask(i)) == i) 438 if (cpumask_first(
439 topology_sibling_cpumask(i)) == i)
438 c->booted_cores++; 440 c->booted_cores++;
439 /* 441 /*
440 * increment the core count for all 442 * increment the core count for all
@@ -1009,8 +1011,8 @@ static __init void disable_smp(void)
1009 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1011 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1010 else 1012 else
1011 physid_set_mask_of_physid(0, &phys_cpu_present_map); 1013 physid_set_mask_of_physid(0, &phys_cpu_present_map);
1012 cpumask_set_cpu(0, cpu_sibling_mask(0)); 1014 cpumask_set_cpu(0, topology_sibling_cpumask(0));
1013 cpumask_set_cpu(0, cpu_core_mask(0)); 1015 cpumask_set_cpu(0, topology_core_cpumask(0));
1014} 1016}
1015 1017
1016enum { 1018enum {
@@ -1293,22 +1295,22 @@ static void remove_siblinginfo(int cpu)
1293 int sibling; 1295 int sibling;
1294 struct cpuinfo_x86 *c = &cpu_data(cpu); 1296 struct cpuinfo_x86 *c = &cpu_data(cpu);
1295 1297
1296 for_each_cpu(sibling, cpu_core_mask(cpu)) { 1298 for_each_cpu(sibling, topology_core_cpumask(cpu)) {
1297 cpumask_clear_cpu(cpu, cpu_core_mask(sibling)); 1299 cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
1298 /*/ 1300 /*/
1299 * last thread sibling in this cpu core going down 1301 * last thread sibling in this cpu core going down
1300 */ 1302 */
1301 if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) 1303 if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
1302 cpu_data(sibling).booted_cores--; 1304 cpu_data(sibling).booted_cores--;
1303 } 1305 }
1304 1306
1305 for_each_cpu(sibling, cpu_sibling_mask(cpu)) 1307 for_each_cpu(sibling, topology_sibling_cpumask(cpu))
1306 cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling)); 1308 cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
1307 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) 1309 for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
1308 cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); 1310 cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
1309 cpumask_clear(cpu_llc_shared_mask(cpu)); 1311 cpumask_clear(cpu_llc_shared_mask(cpu));
1310 cpumask_clear(cpu_sibling_mask(cpu)); 1312 cpumask_clear(topology_sibling_cpumask(cpu));
1311 cpumask_clear(cpu_core_mask(cpu)); 1313 cpumask_clear(topology_core_cpumask(cpu));
1312 c->phys_proc_id = 0; 1314 c->phys_proc_id = 0;
1313 c->cpu_core_id = 0; 1315 c->cpu_core_id = 0;
1314 cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); 1316 cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 26488487bc61..dd8d0791dfb5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -113,7 +113,7 @@ static void check_tsc_warp(unsigned int timeout)
113 */ 113 */
114static inline unsigned int loop_timeout(int cpu) 114static inline unsigned int loop_timeout(int cpu)
115{ 115{
116 return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20; 116 return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
117} 117}
118 118
119/* 119/*
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index e2f5e21c03b3..91d93b95bd86 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -647,7 +647,8 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
647 * @from: Source address, in kernel space. 647 * @from: Source address, in kernel space.
648 * @n: Number of bytes to copy. 648 * @n: Number of bytes to copy.
649 * 649 *
650 * Context: User context only. This function may sleep. 650 * Context: User context only. This function may sleep if pagefaults are
651 * enabled.
651 * 652 *
652 * Copy data from kernel space to user space. 653 * Copy data from kernel space to user space.
653 * 654 *
@@ -668,7 +669,8 @@ EXPORT_SYMBOL(_copy_to_user);
668 * @from: Source address, in user space. 669 * @from: Source address, in user space.
669 * @n: Number of bytes to copy. 670 * @n: Number of bytes to copy.
670 * 671 *
671 * Context: User context only. This function may sleep. 672 * Context: User context only. This function may sleep if pagefaults are
673 * enabled.
672 * 674 *
673 * Copy data from user space to kernel space. 675 * Copy data from user space to kernel space.
674 * 676 *
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 181c53bac3a7..9dc909841739 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,6 +13,7 @@
13#include <linux/hugetlb.h> /* hstate_index_to_shift */ 13#include <linux/hugetlb.h> /* hstate_index_to_shift */
14#include <linux/prefetch.h> /* prefetchw */ 14#include <linux/prefetch.h> /* prefetchw */
15#include <linux/context_tracking.h> /* exception_enter(), ... */ 15#include <linux/context_tracking.h> /* exception_enter(), ... */
16#include <linux/uaccess.h> /* faulthandler_disabled() */
16 17
17#include <asm/traps.h> /* dotraplinkage, ... */ 18#include <asm/traps.h> /* dotraplinkage, ... */
18#include <asm/pgalloc.h> /* pgd_*(), ... */ 19#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -1126,9 +1127,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1126 1127
1127 /* 1128 /*
1128 * If we're in an interrupt, have no user context or are running 1129 * If we're in an interrupt, have no user context or are running
1129 * in an atomic region then we must not take the fault: 1130 * in a region with pagefaults disabled then we must not take the fault
1130 */ 1131 */
1131 if (unlikely(in_atomic() || !mm)) { 1132 if (unlikely(faulthandler_disabled() || !mm)) {
1132 bad_area_nosemaphore(regs, error_code, address); 1133 bad_area_nosemaphore(regs, error_code, address);
1133 return; 1134 return;
1134 } 1135 }
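The fault handlers now bail on faulthandler_disabled() rather than in_atomic(), since pagefault_disable() no longer feeds the preempt count in this series. The flip side is that code which must not sleep can still probe user memory by disabling pagefaults and using a non-sleeping copy variant, as in this sketch; probe_word() is a made-up name, and faulthandler_disabled()/pagefault_disabled() are assumed to be the helpers this series adds to <linux/uaccess.h>:

/* Sketch: probing user memory from a context that must not sleep. With
 * pagefaults disabled, a missing page makes __copy_from_user_inatomic()
 * fail fast instead of entering the (now refused) fault handler.
 */
#include <linux/uaccess.h>
#include <linux/errno.h>

static int probe_word(unsigned long *dst, const unsigned long __user *src)
{
	int ret;

	pagefault_disable();
	ret = __copy_from_user_inatomic(dst, src, sizeof(*dst));
	pagefault_enable();

	return ret ? -EFAULT : 0;
}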
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 4500142bc4aa..eecb207a2037 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
35 unsigned long vaddr; 35 unsigned long vaddr;
36 int idx, type; 36 int idx, type;
37 37
38 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 38 preempt_disable();
39 pagefault_disable(); 39 pagefault_disable();
40 40
41 if (!PageHighMem(page)) 41 if (!PageHighMem(page))
@@ -100,6 +100,7 @@ void __kunmap_atomic(void *kvaddr)
100#endif 100#endif
101 101
102 pagefault_enable(); 102 pagefault_enable();
103 preempt_enable();
103} 104}
104EXPORT_SYMBOL(__kunmap_atomic); 105EXPORT_SYMBOL(__kunmap_atomic);
105 106
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 9ca35fc60cfe..2b7ece0e103a 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -59,6 +59,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
59 unsigned long vaddr; 59 unsigned long vaddr;
60 int idx, type; 60 int idx, type;
61 61
62 preempt_disable();
62 pagefault_disable(); 63 pagefault_disable();
63 64
64 type = kmap_atomic_idx_push(); 65 type = kmap_atomic_idx_push();
@@ -117,5 +118,6 @@ iounmap_atomic(void __iomem *kvaddr)
117 } 118 }
118 119
119 pagefault_enable(); 120 pagefault_enable();
121 preempt_enable();
120} 122}
121EXPORT_SYMBOL_GPL(iounmap_atomic); 123EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 9e3571a6535c..83a44a33cfa1 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -15,10 +15,10 @@
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/hardirq.h> 17#include <linux/hardirq.h>
18#include <linux/uaccess.h>
18#include <asm/mmu_context.h> 19#include <asm/mmu_context.h>
19#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
20#include <asm/hardirq.h> 21#include <asm/hardirq.h>
21#include <asm/uaccess.h>
22#include <asm/pgalloc.h> 22#include <asm/pgalloc.h>
23 23
24DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST; 24DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST;
@@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs)
57 /* If we're in an interrupt or have no user 57 /* If we're in an interrupt or have no user
58 * context, we must not take the fault.. 58 * context, we must not take the fault..
59 */ 59 */
60 if (in_atomic() || !mm) { 60 if (faulthandler_disabled() || !mm) {
61 bad_page_fault(regs, address, SIGSEGV); 61 bad_page_fault(regs, address, SIGSEGV);
62 return; 62 return;
63 } 63 }
diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c
index 8cfb71ec0937..184ceadccc1a 100644
--- a/arch/xtensa/mm/highmem.c
+++ b/arch/xtensa/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
42 enum fixed_addresses idx; 42 enum fixed_addresses idx;
43 unsigned long vaddr; 43 unsigned long vaddr;
44 44
45 preempt_disable();
45 pagefault_disable(); 46 pagefault_disable();
46 if (!PageHighMem(page)) 47 if (!PageHighMem(page))
47 return page_address(page); 48 return page_address(page);
@@ -79,6 +80,7 @@ void __kunmap_atomic(void *kvaddr)
79 } 80 }
80 81
81 pagefault_enable(); 82 pagefault_enable();
83 preempt_enable();
82} 84}
83EXPORT_SYMBOL(__kunmap_atomic); 85EXPORT_SYMBOL(__kunmap_atomic);
84 86
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 5f13f4d0bcce..1e28ddb656b8 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -24,7 +24,7 @@ static int get_first_sibling(unsigned int cpu)
24{ 24{
25 unsigned int ret; 25 unsigned int ret;
26 26
27 ret = cpumask_first(topology_thread_cpumask(cpu)); 27 ret = cpumask_first(topology_sibling_cpumask(cpu));
28 if (ret < nr_cpu_ids) 28 if (ret < nr_cpu_ids)
29 return ret; 29 return ret;
30 30
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index 6bc9cbc01ad6..00b39802d7ec 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -105,7 +105,7 @@ static void round_robin_cpu(unsigned int tsk_index)
105 mutex_lock(&round_robin_lock); 105 mutex_lock(&round_robin_lock);
106 cpumask_clear(tmp); 106 cpumask_clear(tmp);
107 for_each_cpu(cpu, pad_busy_cpus) 107 for_each_cpu(cpu, pad_busy_cpus)
108 cpumask_or(tmp, tmp, topology_thread_cpumask(cpu)); 108 cpumask_or(tmp, tmp, topology_sibling_cpumask(cpu));
109 cpumask_andnot(tmp, cpu_online_mask, tmp); 109 cpumask_andnot(tmp, cpu_online_mask, tmp);
110 /* avoid HT sibilings if possible */ 110 /* avoid HT sibilings if possible */
111 if (cpumask_empty(tmp)) 111 if (cpumask_empty(tmp))
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 6491f45200a7..8b7d7f8e5851 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -61,7 +61,7 @@ static DEVICE_ATTR_RO(physical_package_id);
61define_id_show_func(core_id); 61define_id_show_func(core_id);
62static DEVICE_ATTR_RO(core_id); 62static DEVICE_ATTR_RO(core_id);
63 63
64define_siblings_show_func(thread_siblings, thread_cpumask); 64define_siblings_show_func(thread_siblings, sibling_cpumask);
65static DEVICE_ATTR_RO(thread_siblings); 65static DEVICE_ATTR_RO(thread_siblings);
66static DEVICE_ATTR_RO(thread_siblings_list); 66static DEVICE_ATTR_RO(thread_siblings_list);
67 67
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index b0c18ed8d83f..0136dfcdabf0 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -699,13 +699,14 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
699 dmi_check_system(sw_any_bug_dmi_table); 699 dmi_check_system(sw_any_bug_dmi_table);
700 if (bios_with_sw_any_bug && !policy_is_shared(policy)) { 700 if (bios_with_sw_any_bug && !policy_is_shared(policy)) {
701 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; 701 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
702 cpumask_copy(policy->cpus, cpu_core_mask(cpu)); 702 cpumask_copy(policy->cpus, topology_core_cpumask(cpu));
703 } 703 }
704 704
705 if (check_amd_hwpstate_cpu(cpu) && !acpi_pstate_strict) { 705 if (check_amd_hwpstate_cpu(cpu) && !acpi_pstate_strict) {
706 cpumask_clear(policy->cpus); 706 cpumask_clear(policy->cpus);
707 cpumask_set_cpu(cpu, policy->cpus); 707 cpumask_set_cpu(cpu, policy->cpus);
708 cpumask_copy(data->freqdomain_cpus, cpu_sibling_mask(cpu)); 708 cpumask_copy(data->freqdomain_cpus,
709 topology_sibling_cpumask(cpu));
709 policy->shared_type = CPUFREQ_SHARED_TYPE_HW; 710 policy->shared_type = CPUFREQ_SHARED_TYPE_HW;
710 pr_info_once(PFX "overriding BIOS provided _PSD data\n"); 711 pr_info_once(PFX "overriding BIOS provided _PSD data\n");
711 } 712 }
diff --git a/drivers/cpufreq/p4-clockmod.c b/drivers/cpufreq/p4-clockmod.c
index 529cfd92158f..5dd95dab580d 100644
--- a/drivers/cpufreq/p4-clockmod.c
+++ b/drivers/cpufreq/p4-clockmod.c
@@ -172,7 +172,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
172 unsigned int i; 172 unsigned int i;
173 173
174#ifdef CONFIG_SMP 174#ifdef CONFIG_SMP
175 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); 175 cpumask_copy(policy->cpus, topology_sibling_cpumask(policy->cpu));
176#endif 176#endif
177 177
178 /* Errata workaround */ 178 /* Errata workaround */
diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c
index f9ce7e4bf0fe..5c035d04d827 100644
--- a/drivers/cpufreq/powernow-k8.c
+++ b/drivers/cpufreq/powernow-k8.c
@@ -57,13 +57,6 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
57 57
58static struct cpufreq_driver cpufreq_amd64_driver; 58static struct cpufreq_driver cpufreq_amd64_driver;
59 59
60#ifndef CONFIG_SMP
61static inline const struct cpumask *cpu_core_mask(int cpu)
62{
63 return cpumask_of(0);
64}
65#endif
66
67/* Return a frequency in MHz, given an input fid */ 60/* Return a frequency in MHz, given an input fid */
68static u32 find_freq_from_fid(u32 fid) 61static u32 find_freq_from_fid(u32 fid)
69{ 62{
@@ -620,7 +613,7 @@ static int fill_powernow_table(struct powernow_k8_data *data,
620 613
621 pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); 614 pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
622 data->powernow_table = powernow_table; 615 data->powernow_table = powernow_table;
623 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) 616 if (cpumask_first(topology_core_cpumask(data->cpu)) == data->cpu)
624 print_basics(data); 617 print_basics(data);
625 618
626 for (j = 0; j < data->numps; j++) 619 for (j = 0; j < data->numps; j++)
@@ -784,7 +777,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
784 CPUFREQ_TABLE_END; 777 CPUFREQ_TABLE_END;
785 data->powernow_table = powernow_table; 778 data->powernow_table = powernow_table;
786 779
787 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) 780 if (cpumask_first(topology_core_cpumask(data->cpu)) == data->cpu)
788 print_basics(data); 781 print_basics(data);
789 782
790 /* notify BIOS that we exist */ 783 /* notify BIOS that we exist */
@@ -1090,7 +1083,7 @@ static int powernowk8_cpu_init(struct cpufreq_policy *pol)
1090 if (rc != 0) 1083 if (rc != 0)
1091 goto err_out_exit_acpi; 1084 goto err_out_exit_acpi;
1092 1085
1093 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); 1086 cpumask_copy(pol->cpus, topology_core_cpumask(pol->cpu));
1094 data->available_cores = pol->cpus; 1087 data->available_cores = pol->cpus;
1095 1088
1096 /* min/max the cpu is capable of */ 1089 /* min/max the cpu is capable of */
diff --git a/drivers/cpufreq/speedstep-ich.c b/drivers/cpufreq/speedstep-ich.c
index e56d632a8b21..37555c6b86a7 100644
--- a/drivers/cpufreq/speedstep-ich.c
+++ b/drivers/cpufreq/speedstep-ich.c
@@ -292,7 +292,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
292 292
293 /* only run on CPU to be set, or on its sibling */ 293 /* only run on CPU to be set, or on its sibling */
294#ifdef CONFIG_SMP 294#ifdef CONFIG_SMP
295 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); 295 cpumask_copy(policy->cpus, topology_sibling_cpumask(policy->cpu));
296#endif 296#endif
297 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); 297 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
298 298
diff --git a/drivers/crypto/vmx/aes.c b/drivers/crypto/vmx/aes.c
index ab300ea19434..a9064e36e7b5 100644
--- a/drivers/crypto/vmx/aes.c
+++ b/drivers/crypto/vmx/aes.c
@@ -78,12 +78,14 @@ static int p8_aes_setkey(struct crypto_tfm *tfm, const u8 *key,
78 int ret; 78 int ret;
79 struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm); 79 struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
80 80
81 preempt_disable();
81 pagefault_disable(); 82 pagefault_disable();
82 enable_kernel_altivec(); 83 enable_kernel_altivec();
83 ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key); 84 ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
84 ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key); 85 ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
85 pagefault_enable(); 86 pagefault_enable();
86 87 preempt_enable();
88
87 ret += crypto_cipher_setkey(ctx->fallback, key, keylen); 89 ret += crypto_cipher_setkey(ctx->fallback, key, keylen);
88 return ret; 90 return ret;
89} 91}
@@ -95,10 +97,12 @@ static void p8_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
95 if (in_interrupt()) { 97 if (in_interrupt()) {
96 crypto_cipher_encrypt_one(ctx->fallback, dst, src); 98 crypto_cipher_encrypt_one(ctx->fallback, dst, src);
97 } else { 99 } else {
100 preempt_disable();
98 pagefault_disable(); 101 pagefault_disable();
99 enable_kernel_altivec(); 102 enable_kernel_altivec();
100 aes_p8_encrypt(src, dst, &ctx->enc_key); 103 aes_p8_encrypt(src, dst, &ctx->enc_key);
101 pagefault_enable(); 104 pagefault_enable();
105 preempt_enable();
102 } 106 }
103} 107}
104 108
@@ -109,10 +113,12 @@ static void p8_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
109 if (in_interrupt()) { 113 if (in_interrupt()) {
110 crypto_cipher_decrypt_one(ctx->fallback, dst, src); 114 crypto_cipher_decrypt_one(ctx->fallback, dst, src);
111 } else { 115 } else {
116 preempt_disable();
112 pagefault_disable(); 117 pagefault_disable();
113 enable_kernel_altivec(); 118 enable_kernel_altivec();
114 aes_p8_decrypt(src, dst, &ctx->dec_key); 119 aes_p8_decrypt(src, dst, &ctx->dec_key);
115 pagefault_enable(); 120 pagefault_enable();
121 preempt_enable();
116 } 122 }
117} 123}
118 124
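In the VMX crypto glue, every AltiVec region now pairs an explicit preempt_disable()/preempt_enable() with the existing pagefault_disable()/pagefault_enable(), because the vector unit state is per-CPU and the task must not migrate while it is in use. A hypothetical pair of helpers factoring that bracket; the names are invented, and enable_kernel_altivec()/enable_kernel_fp() are assumed from powerpc's <asm/switch_to.h>:

/* Sketch: factored bracket around AltiVec/FP use in process context. */
#include <linux/preempt.h>
#include <linux/uaccess.h>
#include <asm/switch_to.h>

static inline void p8_vmx_begin(void)
{
	preempt_disable();	/* keep the VMX unit on this CPU */
	pagefault_disable();	/* no sleeping faults while registers are live */
	enable_kernel_altivec();
	enable_kernel_fp();
}

static inline void p8_vmx_end(void)
{
	pagefault_enable();
	preempt_enable();
}

The aes_cbc.c and ghash.c hunks below repeat the same bracket open-coded at each call site.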
diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
index 1a559b7dddb5..477284abdd11 100644
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
@@ -79,11 +79,13 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
79 int ret; 79 int ret;
80 struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm); 80 struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
81 81
82 preempt_disable();
82 pagefault_disable(); 83 pagefault_disable();
83 enable_kernel_altivec(); 84 enable_kernel_altivec();
84 ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key); 85 ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
85 ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key); 86 ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
86 pagefault_enable(); 87 pagefault_enable();
88 preempt_enable();
87 89
88 ret += crypto_blkcipher_setkey(ctx->fallback, key, keylen); 90 ret += crypto_blkcipher_setkey(ctx->fallback, key, keylen);
89 return ret; 91 return ret;
@@ -106,6 +108,7 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
106 if (in_interrupt()) { 108 if (in_interrupt()) {
107 ret = crypto_blkcipher_encrypt(&fallback_desc, dst, src, nbytes); 109 ret = crypto_blkcipher_encrypt(&fallback_desc, dst, src, nbytes);
108 } else { 110 } else {
111 preempt_disable();
109 pagefault_disable(); 112 pagefault_disable();
110 enable_kernel_altivec(); 113 enable_kernel_altivec();
111 114
@@ -119,6 +122,7 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
119 } 122 }
120 123
121 pagefault_enable(); 124 pagefault_enable();
125 preempt_enable();
122 } 126 }
123 127
124 return ret; 128 return ret;
@@ -141,6 +145,7 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
141 if (in_interrupt()) { 145 if (in_interrupt()) {
142 ret = crypto_blkcipher_decrypt(&fallback_desc, dst, src, nbytes); 146 ret = crypto_blkcipher_decrypt(&fallback_desc, dst, src, nbytes);
143 } else { 147 } else {
148 preempt_disable();
144 pagefault_disable(); 149 pagefault_disable();
145 enable_kernel_altivec(); 150 enable_kernel_altivec();
146 151
@@ -154,6 +159,7 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
154 } 159 }
155 160
156 pagefault_enable(); 161 pagefault_enable();
162 preempt_enable();
157 } 163 }
158 164
159 return ret; 165 return ret;
diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index d0ffe277af5c..f255ec4a04d4 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -114,11 +114,13 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
114 if (keylen != GHASH_KEY_LEN) 114 if (keylen != GHASH_KEY_LEN)
115 return -EINVAL; 115 return -EINVAL;
116 116
117 preempt_disable();
117 pagefault_disable(); 118 pagefault_disable();
118 enable_kernel_altivec(); 119 enable_kernel_altivec();
119 enable_kernel_fp(); 120 enable_kernel_fp();
120 gcm_init_p8(ctx->htable, (const u64 *) key); 121 gcm_init_p8(ctx->htable, (const u64 *) key);
121 pagefault_enable(); 122 pagefault_enable();
123 preempt_enable();
122 return crypto_shash_setkey(ctx->fallback, key, keylen); 124 return crypto_shash_setkey(ctx->fallback, key, keylen);
123} 125}
124 126
@@ -140,23 +142,27 @@ static int p8_ghash_update(struct shash_desc *desc,
140 } 142 }
141 memcpy(dctx->buffer + dctx->bytes, src, 143 memcpy(dctx->buffer + dctx->bytes, src,
142 GHASH_DIGEST_SIZE - dctx->bytes); 144 GHASH_DIGEST_SIZE - dctx->bytes);
145 preempt_disable();
143 pagefault_disable(); 146 pagefault_disable();
144 enable_kernel_altivec(); 147 enable_kernel_altivec();
145 enable_kernel_fp(); 148 enable_kernel_fp();
146 gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer, 149 gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer,
147 GHASH_DIGEST_SIZE); 150 GHASH_DIGEST_SIZE);
148 pagefault_enable(); 151 pagefault_enable();
152 preempt_enable();
149 src += GHASH_DIGEST_SIZE - dctx->bytes; 153 src += GHASH_DIGEST_SIZE - dctx->bytes;
150 srclen -= GHASH_DIGEST_SIZE - dctx->bytes; 154 srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
151 dctx->bytes = 0; 155 dctx->bytes = 0;
152 } 156 }
153 len = srclen & ~(GHASH_DIGEST_SIZE - 1); 157 len = srclen & ~(GHASH_DIGEST_SIZE - 1);
154 if (len) { 158 if (len) {
159 preempt_disable();
155 pagefault_disable(); 160 pagefault_disable();
156 enable_kernel_altivec(); 161 enable_kernel_altivec();
157 enable_kernel_fp(); 162 enable_kernel_fp();
158 gcm_ghash_p8(dctx->shash, ctx->htable, src, len); 163 gcm_ghash_p8(dctx->shash, ctx->htable, src, len);
159 pagefault_enable(); 164 pagefault_enable();
165 preempt_enable();
160 src += len; 166 src += len;
161 srclen -= len; 167 srclen -= len;
162 } 168 }
@@ -180,12 +186,14 @@ static int p8_ghash_final(struct shash_desc *desc, u8 *out)
180 if (dctx->bytes) { 186 if (dctx->bytes) {
181 for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++) 187 for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
182 dctx->buffer[i] = 0; 188 dctx->buffer[i] = 0;
189 preempt_disable();
183 pagefault_disable(); 190 pagefault_disable();
184 enable_kernel_altivec(); 191 enable_kernel_altivec();
185 enable_kernel_fp(); 192 enable_kernel_fp();
186 gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer, 193 gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer,
187 GHASH_DIGEST_SIZE); 194 GHASH_DIGEST_SIZE);
188 pagefault_enable(); 195 pagefault_enable();
196 preempt_enable();
189 dctx->bytes = 0; 197 dctx->bytes = 0;
190 } 198 }
191 memcpy(out, dctx->shash, GHASH_DIGEST_SIZE); 199 memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index a3190e793ed4..cc552a4c1f3b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@
32#include "i915_trace.h" 32#include "i915_trace.h"
33#include "intel_drv.h" 33#include "intel_drv.h"
34#include <linux/dma_remapping.h> 34#include <linux/dma_remapping.h>
35#include <linux/uaccess.h>
35 36
36#define __EXEC_OBJECT_HAS_PIN (1<<31) 37#define __EXEC_OBJECT_HAS_PIN (1<<31)
37#define __EXEC_OBJECT_HAS_FENCE (1<<30) 38#define __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -465,7 +466,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
465 } 466 }
466 467
467 /* We can't wait for rendering with pagefaults disabled */ 468 /* We can't wait for rendering with pagefaults disabled */
468 if (obj->active && in_atomic()) 469 if (obj->active && pagefault_disabled())
469 return -EFAULT; 470 return -EFAULT;
470 471
471 if (use_cpu_reloc(obj)) 472 if (use_cpu_reloc(obj))
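The i915 relocation path now asks a narrower question than before: not "are we atomic?" but "did the caller disable pagefaults?", since only the latter forbids faulting user pages in. A rough sketch of that style of check; copy_reloc() is a made-up name and the real i915 flow is considerably more involved:

/* Sketch: bail out so the caller can retry with pagefaults enabled. */
#include <linux/uaccess.h>
#include <linux/errno.h>

static int copy_reloc(void *dst, const void __user *src, size_t len)
{
	if (pagefault_disabled())	/* caller wrapped us in pagefault_disable() */
		return -EFAULT;		/* cannot fault user pages in here */
	return copy_from_user(dst, src, len) ? -EFAULT : 0;
}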
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ed303ba3a593..3e03379e7c5d 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -63,7 +63,8 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius");
63#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) 63#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO)
64 64
65#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
66#define for_each_sibling(i, cpu) for_each_cpu(i, cpu_sibling_mask(cpu)) 66#define for_each_sibling(i, cpu) \
67 for_each_cpu(i, topology_sibling_cpumask(cpu))
67#else 68#else
68#define for_each_sibling(i, cpu) for (i = 0; false; ) 69#define for_each_sibling(i, cpu) for (i = 0; false; )
69#endif 70#endif
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 4b00545a3ace..65944dd8bf6b 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1304,7 +1304,7 @@ static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
1304 if (!cpumask_test_cpu(cpu, thread_mask)) { 1304 if (!cpumask_test_cpu(cpu, thread_mask)) {
1305 ++count; 1305 ++count;
1306 cpumask_or(thread_mask, thread_mask, 1306 cpumask_or(thread_mask, thread_mask,
1307 topology_thread_cpumask(cpu)); 1307 topology_sibling_cpumask(cpu));
1308 } 1308 }
1309 } 1309 }
1310 1310
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
index cc3ab351943e..f9262243f935 100644
--- a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
@@ -87,7 +87,7 @@ static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
87/* return cpumask of HTs in the same core */ 87/* return cpumask of HTs in the same core */
88static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask) 88static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
89{ 89{
90 cpumask_copy(mask, topology_thread_cpumask(cpu)); 90 cpumask_copy(mask, topology_sibling_cpumask(cpu));
91} 91}
92 92
93static void cfs_node_to_cpumask(int node, cpumask_t *mask) 93static void cfs_node_to_cpumask(int node, cpumask_t *mask)
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
index 8e61421515cb..344189ac5698 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/service.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -557,7 +557,7 @@ ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
557 * there are. 557 * there are.
558 */ 558 */
559 /* weight is # of HTs */ 559 /* weight is # of HTs */
560 if (cpumask_weight(topology_thread_cpumask(0)) > 1) { 560 if (cpumask_weight(topology_sibling_cpumask(0)) > 1) {
561 /* depress thread factor for hyper-thread */ 561 /* depress thread factor for hyper-thread */
562 factor = factor - (factor >> 1) + (factor >> 3); 562 factor = factor - (factor >> 1) + (factor >> 3);
563 } 563 }
@@ -2768,7 +2768,7 @@ int ptlrpc_hr_init(void)
2768 2768
2769 init_waitqueue_head(&ptlrpc_hr.hr_waitq); 2769 init_waitqueue_head(&ptlrpc_hr.hr_waitq);
2770 2770
2771 weight = cpumask_weight(topology_thread_cpumask(0)); 2771 weight = cpumask_weight(topology_sibling_cpumask(0));
2772 2772
2773 cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { 2773 cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
2774 hrp->hrp_cpt = i; 2774 hrp->hrp_cpt = i;
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index b59b5a52637e..e56272c919b5 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -8,8 +8,7 @@
8#ifndef CONFIG_SMP 8#ifndef CONFIG_SMP
9/* 9/*
10 * The following implementation only for uniprocessor machines. 10 * The following implementation only for uniprocessor machines.
11 * For UP, it's relies on the fact that pagefault_disable() also disables 11 * It relies on preempt_disable() ensuring mutual exclusion.
12 * preemption to ensure mutual exclusion.
13 * 12 *
14 */ 13 */
15 14
@@ -38,6 +37,7 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
38 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 37 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
39 oparg = 1 << oparg; 38 oparg = 1 << oparg;
40 39
40 preempt_disable();
41 pagefault_disable(); 41 pagefault_disable();
42 42
43 ret = -EFAULT; 43 ret = -EFAULT;
@@ -72,6 +72,7 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
72 72
73out_pagefault_enable: 73out_pagefault_enable:
74 pagefault_enable(); 74 pagefault_enable();
75 preempt_enable();
75 76
76 if (ret == 0) { 77 if (ret == 0) {
77 switch (cmp) { 78 switch (cmp) {
@@ -106,6 +107,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
106{ 107{
107 u32 val; 108 u32 val;
108 109
110 preempt_disable();
109 if (unlikely(get_user(val, uaddr) != 0)) 111 if (unlikely(get_user(val, uaddr) != 0))
110 return -EFAULT; 112 return -EFAULT;
111 113
@@ -113,6 +115,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
113 return -EFAULT; 115 return -EFAULT;
114 116
115 *uval = val; 117 *uval = val;
118 preempt_enable();
116 119
117 return 0; 120 return 0;
118} 121}
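On uniprocessor builds the generic futex ops used to get mutual exclusion for free because pagefault_disable() also disabled preemption; after the decoupling, an explicit preempt_disable()/preempt_enable() restores it. A reduced sketch of the FUTEX_OP_SET case, with error handling and the other operations elided; futex_set_uniprocessor() is an invented name:

/* Sketch: on UP, disabled preemption is what makes the load/store pair
 * appear atomic, now that pagefault_disable() alone no longer guarantees it.
 */
#include <linux/types.h>
#include <linux/preempt.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

static int futex_set_uniprocessor(u32 __user *uaddr, u32 newval, u32 *oldval)
{
	int ret = -EFAULT;

	preempt_disable();
	pagefault_disable();
	if (!get_user(*oldval, uaddr) && !put_user(newval, uaddr))
		ret = 0;
	pagefault_enable();
	preempt_enable();

	return ret;
}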
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index 86c12c93e3cf..8fdcb783197d 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -2,7 +2,6 @@
2#define _LINUX_BH_H 2#define _LINUX_BH_H
3 3
4#include <linux/preempt.h> 4#include <linux/preempt.h>
5#include <linux/preempt_mask.h>
6 5
7#ifdef CONFIG_TRACE_IRQFLAGS 6#ifdef CONFIG_TRACE_IRQFLAGS
8extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); 7extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f4af03404b97..dfd59d6bc6f0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -1,7 +1,7 @@
1#ifndef LINUX_HARDIRQ_H 1#ifndef LINUX_HARDIRQ_H
2#define LINUX_HARDIRQ_H 2#define LINUX_HARDIRQ_H
3 3
4#include <linux/preempt_mask.h> 4#include <linux/preempt.h>
5#include <linux/lockdep.h> 5#include <linux/lockdep.h>
6#include <linux/ftrace_irq.h> 6#include <linux/ftrace_irq.h>
7#include <linux/vtime.h> 7#include <linux/vtime.h>
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 9286a46b7d69..6aefcd0031a6 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -65,6 +65,7 @@ static inline void kunmap(struct page *page)
65 65
66static inline void *kmap_atomic(struct page *page) 66static inline void *kmap_atomic(struct page *page)
67{ 67{
68 preempt_disable();
68 pagefault_disable(); 69 pagefault_disable();
69 return page_address(page); 70 return page_address(page);
70} 71}
@@ -73,6 +74,7 @@ static inline void *kmap_atomic(struct page *page)
73static inline void __kunmap_atomic(void *addr) 74static inline void __kunmap_atomic(void *addr)
74{ 75{
75 pagefault_enable(); 76 pagefault_enable();
77 preempt_enable();
76} 78}
77 79
78#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) 80#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn))
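Even the !CONFIG_HIGHMEM kmap_atomic() stub now disables preemption itself, so the long-standing contract, that the returned address is only usable while the task stays on this CPU and does not sleep, keeps holding without pagefault_disable() implying it. Canonical usage is unchanged, as in this sketch; zero_page_atomic() is an invented name:

/* Sketch: canonical kmap_atomic() usage. The mapping (and, with HIGHMEM,
 * the per-CPU fixmap slot) is only valid while preemption stays disabled,
 * which kmap_atomic()/__kunmap_atomic() now guarantee on their own.
 */
#include <linux/highmem.h>
#include <linux/string.h>

static void zero_page_atomic(struct page *page)
{
	void *addr = kmap_atomic(page);

	memset(addr, 0, PAGE_SIZE);	/* no sleeping between map and unmap */
	kunmap_atomic(addr);
}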
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d22312b31..bb9b075f0eb0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,8 @@ extern struct fs_struct init_fs;
50 .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ 50 .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
51 .rlim = INIT_RLIMITS, \ 51 .rlim = INIT_RLIMITS, \
52 .cputimer = { \ 52 .cputimer = { \
53 .cputime = INIT_CPUTIME, \ 53 .cputime_atomic = INIT_CPUTIME_ATOMIC, \
54 .running = 0, \ 54 .running = 0, \
55 .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
56 }, \ 55 }, \
57 .cred_guard_mutex = \ 56 .cred_guard_mutex = \
58 __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ 57 __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 657fab4efab3..c27dde7215b5 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -141,6 +141,7 @@ static inline void __iomem *
141io_mapping_map_atomic_wc(struct io_mapping *mapping, 141io_mapping_map_atomic_wc(struct io_mapping *mapping,
142 unsigned long offset) 142 unsigned long offset)
143{ 143{
144 preempt_disable();
144 pagefault_disable(); 145 pagefault_disable();
145 return ((char __force __iomem *) mapping) + offset; 146 return ((char __force __iomem *) mapping) + offset;
146} 147}
@@ -149,6 +150,7 @@ static inline void
149io_mapping_unmap_atomic(void __iomem *vaddr) 150io_mapping_unmap_atomic(void __iomem *vaddr)
150{ 151{
151 pagefault_enable(); 152 pagefault_enable();
153 preempt_enable();
152} 154}
153 155
154/* Non-atomic map/unmap */ 156/* Non-atomic map/unmap */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3a5b48e52a9e..060dd7b61c6d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -244,7 +244,8 @@ static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
244 244
245#if defined(CONFIG_MMU) && \ 245#if defined(CONFIG_MMU) && \
246 (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)) 246 (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP))
247void might_fault(void); 247#define might_fault() __might_fault(__FILE__, __LINE__)
248void __might_fault(const char *file, int line);
248#else 249#else
249static inline void might_fault(void) { } 250static inline void might_fault(void) { }
250#endif 251#endif
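might_fault() becomes a macro so that __might_fault() can report the caller's file and line instead of a location inside kernel.h. A hedged sketch of a helper that benefits from the better report; copy_flag_from_user() is invented for illustration:

/* Sketch: under CONFIG_DEBUG_ATOMIC_SLEEP, a call from atomic or
 * pagefault-disabled context now warns with *this* file and line, because
 * might_fault() expands to __might_fault(__FILE__, __LINE__).
 */
#include <linux/kernel.h>
#include <linux/uaccess.h>

static int copy_flag_from_user(int *flag, const int __user *uflag)
{
	might_fault();	/* documents (and debug-checks) that we may sleep */
	return get_user(*flag, uflag);
}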
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index de83b4eb1642..a1a00e14c14f 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -10,13 +10,117 @@
10#include <linux/list.h> 10#include <linux/list.h>
11 11
12/* 12/*
13 * We use the MSB mostly because its available; see <linux/preempt_mask.h> for 13 * We put the hardirq and softirq counter into the preemption
14 * the other bits -- can't include that header due to inclusion hell. 14 * counter. The bitmask has the following meaning:
15 *
16 * - bits 0-7 are the preemption count (max preemption depth: 256)
17 * - bits 8-15 are the softirq count (max # of softirqs: 256)
18 *
19 * The hardirq count could in theory be the same as the number of
20 * interrupts in the system, but we run all interrupt handlers with
21 * interrupts disabled, so we cannot have nesting interrupts. Though
22 * there are a few palaeontologic drivers which reenable interrupts in
23 * the handler, so we need more than one bit here.
24 *
25 * PREEMPT_MASK: 0x000000ff
26 * SOFTIRQ_MASK: 0x0000ff00
27 * HARDIRQ_MASK: 0x000f0000
28 * NMI_MASK: 0x00100000
29 * PREEMPT_ACTIVE: 0x00200000
30 * PREEMPT_NEED_RESCHED: 0x80000000
15 */ 31 */
32#define PREEMPT_BITS 8
33#define SOFTIRQ_BITS 8
34#define HARDIRQ_BITS 4
35#define NMI_BITS 1
36
37#define PREEMPT_SHIFT 0
38#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
39#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
40#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
41
42#define __IRQ_MASK(x) ((1UL << (x))-1)
43
44#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
45#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
46#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
47#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
48
49#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
50#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
51#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
52#define NMI_OFFSET (1UL << NMI_SHIFT)
53
54#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
55
56#define PREEMPT_ACTIVE_BITS 1
57#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
58#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
59
60/* We use the MSB mostly because its available */
16#define PREEMPT_NEED_RESCHED 0x80000000 61#define PREEMPT_NEED_RESCHED 0x80000000
17 62
63/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
18#include <asm/preempt.h> 64#include <asm/preempt.h>
19 65
66#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
67#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
68#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
69 | NMI_MASK))
70
71/*
72 * Are we doing bottom half or hardware interrupt processing?
73 * Are we in a softirq context? Interrupt context?
74 * in_softirq - Are we currently processing softirq or have bh disabled?
75 * in_serving_softirq - Are we currently processing softirq?
76 */
77#define in_irq() (hardirq_count())
78#define in_softirq() (softirq_count())
79#define in_interrupt() (irq_count())
80#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
81
82/*
83 * Are we in NMI context?
84 */
85#define in_nmi() (preempt_count() & NMI_MASK)
86
87#if defined(CONFIG_PREEMPT_COUNT)
88# define PREEMPT_DISABLE_OFFSET 1
89#else
90# define PREEMPT_DISABLE_OFFSET 0
91#endif
92
93/*
94 * The preempt_count offset needed for things like:
95 *
96 * spin_lock_bh()
97 *
98 * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
99 * softirqs, such that unlock sequences of:
100 *
101 * spin_unlock();
102 * local_bh_enable();
103 *
104 * Work as expected.
105 */
106#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET)
107
108/*
109 * Are we running in atomic context? WARNING: this macro cannot
110 * always detect atomic context; in particular, it cannot know about
111 * held spinlocks in non-preemptible kernels. Thus it should not be
112 * used in the general case to determine whether sleeping is possible.
113 * Do not use in_atomic() in driver code.
114 */
115#define in_atomic() (preempt_count() != 0)
116
117/*
118 * Check whether we were atomic before we did preempt_disable():
119 * (used by the scheduler)
120 */
121#define in_atomic_preempt_off() \
122 ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
123
20#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) 124#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
21extern void preempt_count_add(int val); 125extern void preempt_count_add(int val);
22extern void preempt_count_sub(int val); 126extern void preempt_count_sub(int val);
@@ -33,6 +137,18 @@ extern void preempt_count_sub(int val);
33#define preempt_count_inc() preempt_count_add(1) 137#define preempt_count_inc() preempt_count_add(1)
34#define preempt_count_dec() preempt_count_sub(1) 138#define preempt_count_dec() preempt_count_sub(1)
35 139
140#define preempt_active_enter() \
141do { \
142 preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
143 barrier(); \
144} while (0)
145
146#define preempt_active_exit() \
147do { \
148 barrier(); \
149 preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
150} while (0)
151
36#ifdef CONFIG_PREEMPT_COUNT 152#ifdef CONFIG_PREEMPT_COUNT
37 153
38#define preempt_disable() \ 154#define preempt_disable() \
@@ -49,6 +165,8 @@ do { \
49 165
50#define preempt_enable_no_resched() sched_preempt_enable_no_resched() 166#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
51 167
168#define preemptible() (preempt_count() == 0 && !irqs_disabled())
169
52#ifdef CONFIG_PREEMPT 170#ifdef CONFIG_PREEMPT
53#define preempt_enable() \ 171#define preempt_enable() \
54do { \ 172do { \
@@ -121,6 +239,7 @@ do { \
121#define preempt_disable_notrace() barrier() 239#define preempt_disable_notrace() barrier()
122#define preempt_enable_no_resched_notrace() barrier() 240#define preempt_enable_no_resched_notrace() barrier()
123#define preempt_enable_notrace() barrier() 241#define preempt_enable_notrace() barrier()
242#define preemptible() 0
124 243
125#endif /* CONFIG_PREEMPT_COUNT */ 244#endif /* CONFIG_PREEMPT_COUNT */
126 245
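
The consolidated <linux/preempt.h> above now carries both the preempt_count bit layout and the context-test macros that used to live in preempt_mask.h. As an illustration of that layout, here is a minimal user-space sketch (plain C, constants copied from the hunk above; the sample count value is invented) that decodes one preempt_count word the way in_irq()/in_softirq()/in_serving_softirq()/in_nmi() do:

#include <stdio.h>

#define PREEMPT_BITS		8
#define SOFTIRQ_BITS		8
#define HARDIRQ_BITS		4
#define NMI_BITS		1

#define PREEMPT_SHIFT		0
#define SOFTIRQ_SHIFT		(PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT		(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT		(HARDIRQ_SHIFT + HARDIRQ_BITS)

#define __IRQ_MASK(x)		((1UL << (x))-1)
#define SOFTIRQ_MASK		(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK		(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK		(__IRQ_MASK(NMI_BITS) << NMI_SHIFT)

#define SOFTIRQ_OFFSET		(1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET		(1UL << HARDIRQ_SHIFT)
#define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)

int main(void)
{
	/* invented example: one preempt_disable(), one hardirq entry,
	 * bottom halves disabled once */
	unsigned long count = 1 + HARDIRQ_OFFSET + SOFTIRQ_DISABLE_OFFSET;

	printf("in_irq():             %lu\n", count & HARDIRQ_MASK);   /* non-zero */
	printf("in_softirq():         %lu\n", count & SOFTIRQ_MASK);   /* non-zero: bh disabled */
	printf("in_serving_softirq(): %lu\n", count & SOFTIRQ_OFFSET); /* zero: not in a softirq */
	printf("in_nmi():             %lu\n", count & NMI_MASK);       /* zero */
	return 0;
}

The hardirq and softirq masks read non-zero while the serving-softirq and NMI tests stay zero, which is exactly the distinction spelled out in the mask table at the top of the header.
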
diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h
deleted file mode 100644
index dbeec4d4a3be..000000000000
--- a/include/linux/preempt_mask.h
+++ /dev/null
@@ -1,117 +0,0 @@
1#ifndef LINUX_PREEMPT_MASK_H
2#define LINUX_PREEMPT_MASK_H
3
4#include <linux/preempt.h>
5
6/*
7 * We put the hardirq and softirq counter into the preemption
8 * counter. The bitmask has the following meaning:
9 *
10 * - bits 0-7 are the preemption count (max preemption depth: 256)
11 * - bits 8-15 are the softirq count (max # of softirqs: 256)
12 *
13 * The hardirq count could in theory be the same as the number of
14 * interrupts in the system, but we run all interrupt handlers with
15 * interrupts disabled, so we cannot have nesting interrupts. Though
16 * there are a few palaeontologic drivers which reenable interrupts in
17 * the handler, so we need more than one bit here.
18 *
19 * PREEMPT_MASK: 0x000000ff
20 * SOFTIRQ_MASK: 0x0000ff00
21 * HARDIRQ_MASK: 0x000f0000
22 * NMI_MASK: 0x00100000
23 * PREEMPT_ACTIVE: 0x00200000
24 */
25#define PREEMPT_BITS 8
26#define SOFTIRQ_BITS 8
27#define HARDIRQ_BITS 4
28#define NMI_BITS 1
29
30#define PREEMPT_SHIFT 0
31#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
32#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
33#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
34
35#define __IRQ_MASK(x) ((1UL << (x))-1)
36
37#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
38#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
39#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
40#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
41
42#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
43#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
44#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
45#define NMI_OFFSET (1UL << NMI_SHIFT)
46
47#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
48
49#define PREEMPT_ACTIVE_BITS 1
50#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
51#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
52
53#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
54#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
55#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
56 | NMI_MASK))
57
58/*
59 * Are we doing bottom half or hardware interrupt processing?
60 * Are we in a softirq context? Interrupt context?
61 * in_softirq - Are we currently processing softirq or have bh disabled?
62 * in_serving_softirq - Are we currently processing softirq?
63 */
64#define in_irq() (hardirq_count())
65#define in_softirq() (softirq_count())
66#define in_interrupt() (irq_count())
67#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
68
69/*
70 * Are we in NMI context?
71 */
72#define in_nmi() (preempt_count() & NMI_MASK)
73
74#if defined(CONFIG_PREEMPT_COUNT)
75# define PREEMPT_CHECK_OFFSET 1
76#else
77# define PREEMPT_CHECK_OFFSET 0
78#endif
79
80/*
81 * The preempt_count offset needed for things like:
82 *
83 * spin_lock_bh()
84 *
85 * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
86 * softirqs, such that unlock sequences of:
87 *
88 * spin_unlock();
89 * local_bh_enable();
90 *
91 * Work as expected.
92 */
93#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET)
94
95/*
96 * Are we running in atomic context? WARNING: this macro cannot
97 * always detect atomic context; in particular, it cannot know about
98 * held spinlocks in non-preemptible kernels. Thus it should not be
99 * used in the general case to determine whether sleeping is possible.
100 * Do not use in_atomic() in driver code.
101 */
102#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
103
104/*
105 * Check whether we were atomic before we did preempt_disable():
106 * (used by the scheduler, *after* releasing the kernel lock)
107 */
108#define in_atomic_preempt_off() \
109 ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
110
111#ifdef CONFIG_PREEMPT_COUNT
112# define preemptible() (preempt_count() == 0 && !irqs_disabled())
113#else
114# define preemptible() 0
115#endif
116
117#endif /* LINUX_PREEMPT_MASK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e6122734..7de815c6fa78 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -25,7 +25,7 @@ struct sched_param {
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include <linux/nodemask.h> 26#include <linux/nodemask.h>
27#include <linux/mm_types.h> 27#include <linux/mm_types.h>
28#include <linux/preempt_mask.h> 28#include <linux/preempt.h>
29 29
30#include <asm/page.h> 30#include <asm/page.h>
31#include <asm/ptrace.h> 31#include <asm/ptrace.h>
@@ -173,7 +173,12 @@ extern unsigned long nr_iowait_cpu(int cpu);
173extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); 173extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
174 174
175extern void calc_global_load(unsigned long ticks); 175extern void calc_global_load(unsigned long ticks);
176
177#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
176extern void update_cpu_load_nohz(void); 178extern void update_cpu_load_nohz(void);
179#else
180static inline void update_cpu_load_nohz(void) { }
181#endif
177 182
178extern unsigned long get_parent_ip(unsigned long addr); 183extern unsigned long get_parent_ip(unsigned long addr);
179 184
@@ -213,9 +218,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
213#define TASK_WAKEKILL 128 218#define TASK_WAKEKILL 128
214#define TASK_WAKING 256 219#define TASK_WAKING 256
215#define TASK_PARKED 512 220#define TASK_PARKED 512
216#define TASK_STATE_MAX 1024 221#define TASK_NOLOAD 1024
222#define TASK_STATE_MAX 2048
217 223
218#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP" 224#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
219 225
220extern char ___assert_task_state[1 - 2*!!( 226extern char ___assert_task_state[1 - 2*!!(
221 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; 227 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@ -225,6 +231,8 @@ extern char ___assert_task_state[1 - 2*!!(
225#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) 231#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
226#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) 232#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
227 233
234#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
235
228/* Convenience macros for the sake of wake_up */ 236/* Convenience macros for the sake of wake_up */
229#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) 237#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
230#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) 238#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
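
TASK_IDLE above combines TASK_UNINTERRUPTIBLE with the new TASK_NOLOAD bit, so a kernel thread can sleep uninterruptibly without inflating the load average. A hedged sketch of the intended use in a kthread loop follows; the thread function and the work-check helpers are hypothetical, only the state handling reflects the new definitions:

static int my_worker(void *data)			/* hypothetical kthread */
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_IDLE);		/* "D" sleep, no load contribution */
		if (!my_work_pending(data))		/* hypothetical condition */
			schedule();
		__set_current_state(TASK_RUNNING);
		my_do_work(data);			/* hypothetical */
	}
	return 0;
}
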
@@ -240,7 +248,8 @@ extern char ___assert_task_state[1 - 2*!!(
240 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) 248 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
241#define task_contributes_to_load(task) \ 249#define task_contributes_to_load(task) \
242 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ 250 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
243 (task->flags & PF_FROZEN) == 0) 251 (task->flags & PF_FROZEN) == 0 && \
252 (task->state & TASK_NOLOAD) == 0)
244 253
245#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 254#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
246 255
@@ -567,6 +576,23 @@ struct task_cputime {
567 .sum_exec_runtime = 0, \ 576 .sum_exec_runtime = 0, \
568 } 577 }
569 578
579/*
580 * This is the atomic variant of task_cputime, which can be used for
581 * storing and updating task_cputime statistics without locking.
582 */
583struct task_cputime_atomic {
584 atomic64_t utime;
585 atomic64_t stime;
586 atomic64_t sum_exec_runtime;
587};
588
589#define INIT_CPUTIME_ATOMIC \
590 (struct task_cputime_atomic) { \
591 .utime = ATOMIC64_INIT(0), \
592 .stime = ATOMIC64_INIT(0), \
593 .sum_exec_runtime = ATOMIC64_INIT(0), \
594 }
595
570#ifdef CONFIG_PREEMPT_COUNT 596#ifdef CONFIG_PREEMPT_COUNT
571#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) 597#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
572#else 598#else
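
struct task_cputime_atomic above is what lets the thread-group totals be updated without the cputimer spinlock that is removed further down. A minimal sketch of the kind of lockless accumulation this enables; the helper name and the u64 deltas are illustrative, not taken from this patch:

/* Fold one thread's accounting delta into the group totals without a lock. */
static void account_group_delta(struct task_cputime_atomic *cta,
				u64 utime, u64 stime, u64 runtime)
{
	atomic64_add(utime,   &cta->utime);
	atomic64_add(stime,   &cta->stime);
	atomic64_add(runtime, &cta->sum_exec_runtime);
}
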
@@ -584,18 +610,16 @@ struct task_cputime {
584 610
585/** 611/**
586 * struct thread_group_cputimer - thread group interval timer counts 612 * struct thread_group_cputimer - thread group interval timer counts
587 * @cputime: thread group interval timers. 613 * @cputime_atomic: atomic thread group interval timers.
588 * @running: non-zero when there are timers running and 614 * @running: non-zero when there are timers running and
589 * @cputime receives updates. 615 * @cputime receives updates.
590 * @lock: lock for fields in this struct.
591 * 616 *
592 * This structure contains the version of task_cputime, above, that is 617 * This structure contains the version of task_cputime, above, that is
593 * used for thread group CPU timer calculations. 618 * used for thread group CPU timer calculations.
594 */ 619 */
595struct thread_group_cputimer { 620struct thread_group_cputimer {
596 struct task_cputime cputime; 621 struct task_cputime_atomic cputime_atomic;
597 int running; 622 int running;
598 raw_spinlock_t lock;
599}; 623};
600 624
601#include <linux/rwsem.h> 625#include <linux/rwsem.h>
@@ -900,6 +924,50 @@ enum cpu_idle_type {
900#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) 924#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
901 925
902/* 926/*
927 * Wake-queues are lists of tasks with a pending wakeup, whose
928 * callers have already marked the task as woken internally,
929 * and can thus carry on. A common use case is being able to
 930 * do the wakeups once the corresponding user lock has been
931 * released.
932 *
933 * We hold reference to each task in the list across the wakeup,
934 * thus guaranteeing that the memory is still valid by the time
935 * the actual wakeups are performed in wake_up_q().
936 *
937 * One per task suffices, because there's never a need for a task to be
938 * in two wake queues simultaneously; it is forbidden to abandon a task
939 * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
940 * already in a wake queue, the wakeup will happen soon and the second
941 * waker can just skip it.
942 *
943 * The WAKE_Q macro declares and initializes the list head.
944 * wake_up_q() does NOT reinitialize the list; it's expected to be
945 * called near the end of a function, where the fact that the queue is
946 * not used again will be easy to see by inspection.
947 *
948 * Note that this can cause spurious wakeups. schedule() callers
949 * must ensure the call is done inside a loop, confirming that the
950 * wakeup condition has in fact occurred.
951 */
952struct wake_q_node {
953 struct wake_q_node *next;
954};
955
956struct wake_q_head {
957 struct wake_q_node *first;
958 struct wake_q_node **lastp;
959};
960
961#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
962
963#define WAKE_Q(name) \
964 struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
965
966extern void wake_q_add(struct wake_q_head *head,
967 struct task_struct *task);
968extern void wake_up_q(struct wake_q_head *head);
969
970/*
903 * sched-domains (multiprocessor balancing) declarations: 971 * sched-domains (multiprocessor balancing) declarations:
904 */ 972 */
905#ifdef CONFIG_SMP 973#ifdef CONFIG_SMP
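
The wake-queue declarations above establish the pattern that the futex and mqueue changes later in this patch follow: queue wakeups while holding a lock, issue them after the lock is dropped. A condensed sketch of that calling pattern; the lock, waiter type and list are hypothetical, only WAKE_Q()/wake_q_add()/wake_up_q() come from this header:

	struct my_waiter *w;			/* hypothetical waiter type */
	WAKE_Q(wake_q);				/* on-stack queue head */

	spin_lock(&my_lock);			/* hypothetical lock */
	list_for_each_entry(w, &my_waiters, list)
		wake_q_add(&wake_q, w->task);	/* grabs a reference on each task */
	spin_unlock(&my_lock);

	wake_up_q(&wake_q);			/* the real wake_up_process() calls,
						 * done without the lock held */
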
@@ -1334,8 +1402,6 @@ struct task_struct {
1334 int rcu_read_lock_nesting; 1402 int rcu_read_lock_nesting;
1335 union rcu_special rcu_read_unlock_special; 1403 union rcu_special rcu_read_unlock_special;
1336 struct list_head rcu_node_entry; 1404 struct list_head rcu_node_entry;
1337#endif /* #ifdef CONFIG_PREEMPT_RCU */
1338#ifdef CONFIG_PREEMPT_RCU
1339 struct rcu_node *rcu_blocked_node; 1405 struct rcu_node *rcu_blocked_node;
1340#endif /* #ifdef CONFIG_PREEMPT_RCU */ 1406#endif /* #ifdef CONFIG_PREEMPT_RCU */
1341#ifdef CONFIG_TASKS_RCU 1407#ifdef CONFIG_TASKS_RCU
@@ -1369,7 +1435,7 @@ struct task_struct {
1369 int exit_state; 1435 int exit_state;
1370 int exit_code, exit_signal; 1436 int exit_code, exit_signal;
1371 int pdeath_signal; /* The signal sent when the parent dies */ 1437 int pdeath_signal; /* The signal sent when the parent dies */
1372 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1438 unsigned long jobctl; /* JOBCTL_*, siglock protected */
1373 1439
1374 /* Used for emulating ABI behavior of previous Linux versions */ 1440 /* Used for emulating ABI behavior of previous Linux versions */
1375 unsigned int personality; 1441 unsigned int personality;
@@ -1511,6 +1577,8 @@ struct task_struct {
1511 /* Protection of the PI data structures: */ 1577 /* Protection of the PI data structures: */
1512 raw_spinlock_t pi_lock; 1578 raw_spinlock_t pi_lock;
1513 1579
1580 struct wake_q_node wake_q;
1581
1514#ifdef CONFIG_RT_MUTEXES 1582#ifdef CONFIG_RT_MUTEXES
1515 /* PI waiters blocked on a rt_mutex held by this task */ 1583 /* PI waiters blocked on a rt_mutex held by this task */
1516 struct rb_root pi_waiters; 1584 struct rb_root pi_waiters;
@@ -1724,6 +1792,7 @@ struct task_struct {
1724#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 1792#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
1725 unsigned long task_state_change; 1793 unsigned long task_state_change;
1726#endif 1794#endif
1795 int pagefault_disabled;
1727}; 1796};
1728 1797
1729/* Future-safe accessor for struct task_struct's cpus_allowed. */ 1798/* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -2077,22 +2146,22 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
2077#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ 2146#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
2078#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ 2147#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
2079 2148
2080#define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) 2149#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
2081#define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) 2150#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
2082#define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) 2151#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT)
2083#define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) 2152#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT)
2084#define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) 2153#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT)
2085#define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) 2154#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT)
2086#define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) 2155#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
2087 2156
2088#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) 2157#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
2089#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) 2158#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
2090 2159
2091extern bool task_set_jobctl_pending(struct task_struct *task, 2160extern bool task_set_jobctl_pending(struct task_struct *task,
2092 unsigned int mask); 2161 unsigned long mask);
2093extern void task_clear_jobctl_trapping(struct task_struct *task); 2162extern void task_clear_jobctl_trapping(struct task_struct *task);
2094extern void task_clear_jobctl_pending(struct task_struct *task, 2163extern void task_clear_jobctl_pending(struct task_struct *task,
2095 unsigned int mask); 2164 unsigned long mask);
2096 2165
2097static inline void rcu_copy_process(struct task_struct *p) 2166static inline void rcu_copy_process(struct task_struct *p)
2098{ 2167{
@@ -2962,11 +3031,6 @@ static __always_inline bool need_resched(void)
2962void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); 3031void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
2963void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); 3032void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
2964 3033
2965static inline void thread_group_cputime_init(struct signal_struct *sig)
2966{
2967 raw_spin_lock_init(&sig->cputimer.lock);
2968}
2969
2970/* 3034/*
2971 * Reevaluate whether the task has signals pending delivery. 3035 * Reevaluate whether the task has signals pending delivery.
2972 * Wake the task if so. 3036 * Wake the task if so.
@@ -3080,13 +3144,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
3080static inline unsigned long task_rlimit(const struct task_struct *tsk, 3144static inline unsigned long task_rlimit(const struct task_struct *tsk,
3081 unsigned int limit) 3145 unsigned int limit)
3082{ 3146{
3083 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); 3147 return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
3084} 3148}
3085 3149
3086static inline unsigned long task_rlimit_max(const struct task_struct *tsk, 3150static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
3087 unsigned int limit) 3151 unsigned int limit)
3088{ 3152{
3089 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); 3153 return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
3090} 3154}
3091 3155
3092static inline unsigned long rlimit(unsigned int limit) 3156static inline unsigned long rlimit(unsigned int limit)
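
The ACCESS_ONCE() -> READ_ONCE() substitutions above (and repeated through the rest of this series) are mechanical: the value read is the same, but READ_ONCE() states the read-only intent explicitly. A two-line sketch with a hypothetical shared variable:

	extern unsigned long shared_counter;	/* hypothetical, updated concurrently */
	unsigned long snap;

	snap = ACCESS_ONCE(shared_counter);	/* old spelling */
	snap = READ_ONCE(shared_counter);	/* preferred: one read the compiler
						 * may not split or repeat */
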
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 909b6e43b694..73ddad1e0fa3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -191,8 +191,8 @@ static inline int cpu_to_mem(int cpu)
191#ifndef topology_core_id 191#ifndef topology_core_id
192#define topology_core_id(cpu) ((void)(cpu), 0) 192#define topology_core_id(cpu) ((void)(cpu), 0)
193#endif 193#endif
194#ifndef topology_thread_cpumask 194#ifndef topology_sibling_cpumask
195#define topology_thread_cpumask(cpu) cpumask_of(cpu) 195#define topology_sibling_cpumask(cpu) cpumask_of(cpu)
196#endif 196#endif
197#ifndef topology_core_cpumask 197#ifndef topology_core_cpumask
198#define topology_core_cpumask(cpu) cpumask_of(cpu) 198#define topology_core_cpumask(cpu) cpumask_of(cpu)
@@ -201,7 +201,7 @@ static inline int cpu_to_mem(int cpu)
201#ifdef CONFIG_SCHED_SMT 201#ifdef CONFIG_SCHED_SMT
202static inline const struct cpumask *cpu_smt_mask(int cpu) 202static inline const struct cpumask *cpu_smt_mask(int cpu)
203{ 203{
204 return topology_thread_cpumask(cpu); 204 return topology_sibling_cpumask(cpu);
205} 205}
206#endif 206#endif
207 207
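
With the topology_thread_cpumask() -> topology_sibling_cpumask() rename above, iterating the SMT siblings of a CPU reads the same on every architecture. A short sketch; the per-sibling callback is hypothetical:

	int cpu = smp_processor_id();
	int sibling;

	/* visit every hardware thread that shares a core with 'cpu' */
	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
		my_per_sibling_setup(sibling);	/* hypothetical */
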
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index ecd3319dac33..ae572c138607 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -1,21 +1,30 @@
1#ifndef __LINUX_UACCESS_H__ 1#ifndef __LINUX_UACCESS_H__
2#define __LINUX_UACCESS_H__ 2#define __LINUX_UACCESS_H__
3 3
4#include <linux/preempt.h> 4#include <linux/sched.h>
5#include <asm/uaccess.h> 5#include <asm/uaccess.h>
6 6
7static __always_inline void pagefault_disabled_inc(void)
8{
9 current->pagefault_disabled++;
10}
11
12static __always_inline void pagefault_disabled_dec(void)
13{
14 current->pagefault_disabled--;
15 WARN_ON(current->pagefault_disabled < 0);
16}
17
7/* 18/*
8 * These routines enable/disable the pagefault handler in that 19 * These routines enable/disable the pagefault handler. If disabled, it will
9 * it will not take any locks and go straight to the fixup table. 20 * not take any locks and go straight to the fixup table.
10 * 21 *
11 * They have great resemblance to the preempt_disable/enable calls 22 * User access methods will not sleep when called from a pagefault_disabled()
12 * and in fact they are identical; this is because currently there is 23 * environment.
13 * no other way to make the pagefault handlers do this. So we do
14 * disable preemption but we don't necessarily care about that.
15 */ 24 */
16static inline void pagefault_disable(void) 25static inline void pagefault_disable(void)
17{ 26{
18 preempt_count_inc(); 27 pagefault_disabled_inc();
19 /* 28 /*
20 * make sure to have issued the store before a pagefault 29 * make sure to have issued the store before a pagefault
21 * can hit. 30 * can hit.
@@ -25,18 +34,31 @@ static inline void pagefault_disable(void)
25 34
26static inline void pagefault_enable(void) 35static inline void pagefault_enable(void)
27{ 36{
28#ifndef CONFIG_PREEMPT
29 /* 37 /*
30 * make sure to issue those last loads/stores before enabling 38 * make sure to issue those last loads/stores before enabling
31 * the pagefault handler again. 39 * the pagefault handler again.
32 */ 40 */
33 barrier(); 41 barrier();
34 preempt_count_dec(); 42 pagefault_disabled_dec();
35#else
36 preempt_enable();
37#endif
38} 43}
39 44
45/*
46 * Is the pagefault handler disabled? If so, user access methods will not sleep.
47 */
48#define pagefault_disabled() (current->pagefault_disabled != 0)
49
50/*
51 * The pagefault handler is in general disabled by pagefault_disable() or
52 * when in irq context (via in_atomic()).
53 *
54 * This function should only be used by the fault handlers. Other users should
55 * stick to pagefault_disabled().
56 * Please NEVER use preempt_disable() to disable the fault handler. With
57 * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled.
58 * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT.
59 */
60#define faulthandler_disabled() (pagefault_disabled() || in_atomic())
61
40#ifndef ARCH_HAS_NOCACHE_UACCESS 62#ifndef ARCH_HAS_NOCACHE_UACCESS
41 63
42static inline unsigned long __copy_from_user_inatomic_nocache(void *to, 64static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
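
With the uaccess.h change above, pagefault_disable()/pagefault_enable() nest via a per-task counter instead of the preempt count, and fault handlers are meant to test faulthandler_disabled() rather than in_atomic(). A minimal sketch of a caller doing an atomic (non-sleeping) user access and of the handler-side test; 'krec' and 'uptr' are hypothetical, only the calling pattern matters:

	unsigned long left;

	pagefault_disable();			/* faults go straight to the fixup table */
	left = __copy_from_user_inatomic(&krec, uptr, sizeof(krec));
	pagefault_enable();

	if (left)				/* would have faulted: retry with a
						 * sleeping copy, outside any locks */
		left = copy_from_user(&krec, uptr, sizeof(krec));

	/* and in an architecture fault handler (cf. the arch/*/mm/fault.c entries
	 * in the diffstat), the "may we sleep to resolve this?" check becomes: */
	if (faulthandler_disabled() || !current->mm)
		goto no_context;		/* hypothetical bail-out label */
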
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2db83349865b..d69ac4ecc88b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -969,7 +969,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *);
969 * on that signal. 969 * on that signal.
970 */ 970 */
971static inline int 971static inline int
972wait_on_bit(void *word, int bit, unsigned mode) 972wait_on_bit(unsigned long *word, int bit, unsigned mode)
973{ 973{
974 might_sleep(); 974 might_sleep();
975 if (!test_bit(bit, word)) 975 if (!test_bit(bit, word))
@@ -994,7 +994,7 @@ wait_on_bit(void *word, int bit, unsigned mode)
994 * on that signal. 994 * on that signal.
995 */ 995 */
996static inline int 996static inline int
997wait_on_bit_io(void *word, int bit, unsigned mode) 997wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
998{ 998{
999 might_sleep(); 999 might_sleep();
1000 if (!test_bit(bit, word)) 1000 if (!test_bit(bit, word))
@@ -1020,7 +1020,8 @@ wait_on_bit_io(void *word, int bit, unsigned mode)
1020 * received a signal and the mode permitted wakeup on that signal. 1020 * received a signal and the mode permitted wakeup on that signal.
1021 */ 1021 */
1022static inline int 1022static inline int
1023wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) 1023wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
1024 unsigned long timeout)
1024{ 1025{
1025 might_sleep(); 1026 might_sleep();
1026 if (!test_bit(bit, word)) 1027 if (!test_bit(bit, word))
@@ -1047,7 +1048,8 @@ wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout)
1047 * on that signal. 1048 * on that signal.
1048 */ 1049 */
1049static inline int 1050static inline int
1050wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) 1051wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
1052 unsigned mode)
1051{ 1053{
1052 might_sleep(); 1054 might_sleep();
1053 if (!test_bit(bit, word)) 1055 if (!test_bit(bit, word))
@@ -1075,7 +1077,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode
1075 * the @mode allows that signal to wake the process. 1077 * the @mode allows that signal to wake the process.
1076 */ 1078 */
1077static inline int 1079static inline int
1078wait_on_bit_lock(void *word, int bit, unsigned mode) 1080wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
1079{ 1081{
1080 might_sleep(); 1082 might_sleep();
1081 if (!test_and_set_bit(bit, word)) 1083 if (!test_and_set_bit(bit, word))
@@ -1099,7 +1101,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode)
1099 * the @mode allows that signal to wake the process. 1101 * the @mode allows that signal to wake the process.
1100 */ 1102 */
1101static inline int 1103static inline int
1102wait_on_bit_lock_io(void *word, int bit, unsigned mode) 1104wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
1103{ 1105{
1104 might_sleep(); 1106 might_sleep();
1105 if (!test_and_set_bit(bit, word)) 1107 if (!test_and_set_bit(bit, word))
@@ -1125,7 +1127,8 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode)
1125 * the @mode allows that signal to wake the process. 1127 * the @mode allows that signal to wake the process.
1126 */ 1128 */
1127static inline int 1129static inline int
1128wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) 1130wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
1131 unsigned mode)
1129{ 1132{
1130 might_sleep(); 1133 might_sleep();
1131 if (!test_and_set_bit(bit, word)) 1134 if (!test_and_set_bit(bit, word))
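
Typing the wait_on_bit*() word parameter as unsigned long * above turns a mismatched bitmap word into a compile-time warning rather than silent misuse; call sites are otherwise unchanged. A small sketch with a hypothetical flags word:

	static unsigned long my_flags;		/* hypothetical bitmap word */

	/* sleep until another context clears bit 0 of my_flags */
	wait_on_bit(&my_flags, 0, TASK_UNINTERRUPTIBLE);
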
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 30fedaf3e56a..d57a575fe31f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -147,7 +147,8 @@ TRACE_EVENT(sched_switch,
147 __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", 147 __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
148 { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, 148 { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
149 { 16, "Z" }, { 32, "X" }, { 64, "x" }, 149 { 16, "Z" }, { 32, "X" }, { 64, "x" },
150 { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R", 150 { 128, "K" }, { 256, "W" }, { 512, "P" },
151 { 1024, "N" }) : "R",
151 __entry->prev_state & TASK_STATE_MAX ? "+" : "", 152 __entry->prev_state & TASK_STATE_MAX ? "+" : "",
152 __entry->next_comm, __entry->next_pid, __entry->next_prio) 153 __entry->next_comm, __entry->next_pid, __entry->next_prio)
153); 154);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 3aaea7ffd077..a24ba9fe5bb8 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -47,8 +47,7 @@
47#define RECV 1 47#define RECV 1
48 48
49#define STATE_NONE 0 49#define STATE_NONE 0
50#define STATE_PENDING 1 50#define STATE_READY 1
51#define STATE_READY 2
52 51
53struct posix_msg_tree_node { 52struct posix_msg_tree_node {
54 struct rb_node rb_node; 53 struct rb_node rb_node;
@@ -571,15 +570,12 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
571 wq_add(info, sr, ewp); 570 wq_add(info, sr, ewp);
572 571
573 for (;;) { 572 for (;;) {
574 set_current_state(TASK_INTERRUPTIBLE); 573 __set_current_state(TASK_INTERRUPTIBLE);
575 574
576 spin_unlock(&info->lock); 575 spin_unlock(&info->lock);
577 time = schedule_hrtimeout_range_clock(timeout, 0, 576 time = schedule_hrtimeout_range_clock(timeout, 0,
578 HRTIMER_MODE_ABS, CLOCK_REALTIME); 577 HRTIMER_MODE_ABS, CLOCK_REALTIME);
579 578
580 while (ewp->state == STATE_PENDING)
581 cpu_relax();
582
583 if (ewp->state == STATE_READY) { 579 if (ewp->state == STATE_READY) {
584 retval = 0; 580 retval = 0;
585 goto out; 581 goto out;
@@ -907,11 +903,15 @@ out_name:
907 * list of waiting receivers. A sender checks that list before adding the new 903 * list of waiting receivers. A sender checks that list before adding the new
908 * message into the message array. If there is a waiting receiver, then it 904 * message into the message array. If there is a waiting receiver, then it
909 * bypasses the message array and directly hands the message over to the 905 * bypasses the message array and directly hands the message over to the
910 * receiver. 906 * receiver. The receiver accepts the message and returns without grabbing the
911 * The receiver accepts the message and returns without grabbing the queue 907 * queue spinlock:
912 * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers 908 *
913 * are necessary. The same algorithm is used for sysv semaphores, see 909 * - Set pointer to message.
914 * ipc/sem.c for more details. 910 * - Queue the receiver task for later wakeup (without the info->lock).
911 * - Update its state to STATE_READY. Now the receiver can continue.
912 * - Wake up the process after the lock is dropped. Should the process wake up
913 * before this wakeup (due to a timeout or a signal) it will either see
914 * STATE_READY and continue or acquire the lock to check the state again.
915 * 915 *
916 * The same algorithm is used for senders. 916 * The same algorithm is used for senders.
917 */ 917 */
@@ -919,21 +919,29 @@ out_name:
919/* pipelined_send() - send a message directly to the task waiting in 919/* pipelined_send() - send a message directly to the task waiting in
920 * sys_mq_timedreceive() (without inserting message into a queue). 920 * sys_mq_timedreceive() (without inserting message into a queue).
921 */ 921 */
922static inline void pipelined_send(struct mqueue_inode_info *info, 922static inline void pipelined_send(struct wake_q_head *wake_q,
923 struct mqueue_inode_info *info,
923 struct msg_msg *message, 924 struct msg_msg *message,
924 struct ext_wait_queue *receiver) 925 struct ext_wait_queue *receiver)
925{ 926{
926 receiver->msg = message; 927 receiver->msg = message;
927 list_del(&receiver->list); 928 list_del(&receiver->list);
928 receiver->state = STATE_PENDING; 929 wake_q_add(wake_q, receiver->task);
929 wake_up_process(receiver->task); 930 /*
930 smp_wmb(); 931 * Rely on the implicit cmpxchg barrier from wake_q_add such
932 * that we can ensure that updating receiver->state is the last
933 * write operation: As once set, the receiver can continue,
934 * and if we don't have the reference count from the wake_q,
935 * yet, at that point we can later have a use-after-free
936 * condition and bogus wakeup.
937 */
931 receiver->state = STATE_READY; 938 receiver->state = STATE_READY;
932} 939}
933 940
934/* pipelined_receive() - if there is task waiting in sys_mq_timedsend() 941/* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
935 * gets its message and put to the queue (we have one free place for sure). */ 942 * gets its message and put to the queue (we have one free place for sure). */
936static inline void pipelined_receive(struct mqueue_inode_info *info) 943static inline void pipelined_receive(struct wake_q_head *wake_q,
944 struct mqueue_inode_info *info)
937{ 945{
938 struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND); 946 struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
939 947
@@ -944,10 +952,9 @@ static inline void pipelined_receive(struct mqueue_inode_info *info)
944 } 952 }
945 if (msg_insert(sender->msg, info)) 953 if (msg_insert(sender->msg, info))
946 return; 954 return;
955
947 list_del(&sender->list); 956 list_del(&sender->list);
948 sender->state = STATE_PENDING; 957 wake_q_add(wake_q, sender->task);
949 wake_up_process(sender->task);
950 smp_wmb();
951 sender->state = STATE_READY; 958 sender->state = STATE_READY;
952} 959}
953 960
@@ -965,6 +972,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
965 struct timespec ts; 972 struct timespec ts;
966 struct posix_msg_tree_node *new_leaf = NULL; 973 struct posix_msg_tree_node *new_leaf = NULL;
967 int ret = 0; 974 int ret = 0;
975 WAKE_Q(wake_q);
968 976
969 if (u_abs_timeout) { 977 if (u_abs_timeout) {
970 int res = prepare_timeout(u_abs_timeout, &expires, &ts); 978 int res = prepare_timeout(u_abs_timeout, &expires, &ts);
@@ -1049,7 +1057,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
1049 } else { 1057 } else {
1050 receiver = wq_get_first_waiter(info, RECV); 1058 receiver = wq_get_first_waiter(info, RECV);
1051 if (receiver) { 1059 if (receiver) {
1052 pipelined_send(info, msg_ptr, receiver); 1060 pipelined_send(&wake_q, info, msg_ptr, receiver);
1053 } else { 1061 } else {
1054 /* adds message to the queue */ 1062 /* adds message to the queue */
1055 ret = msg_insert(msg_ptr, info); 1063 ret = msg_insert(msg_ptr, info);
@@ -1062,6 +1070,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
1062 } 1070 }
1063out_unlock: 1071out_unlock:
1064 spin_unlock(&info->lock); 1072 spin_unlock(&info->lock);
1073 wake_up_q(&wake_q);
1065out_free: 1074out_free:
1066 if (ret) 1075 if (ret)
1067 free_msg(msg_ptr); 1076 free_msg(msg_ptr);
@@ -1149,14 +1158,17 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
1149 msg_ptr = wait.msg; 1158 msg_ptr = wait.msg;
1150 } 1159 }
1151 } else { 1160 } else {
1161 WAKE_Q(wake_q);
1162
1152 msg_ptr = msg_get(info); 1163 msg_ptr = msg_get(info);
1153 1164
1154 inode->i_atime = inode->i_mtime = inode->i_ctime = 1165 inode->i_atime = inode->i_mtime = inode->i_ctime =
1155 CURRENT_TIME; 1166 CURRENT_TIME;
1156 1167
1157 /* There is now free space in queue. */ 1168 /* There is now free space in queue. */
1158 pipelined_receive(info); 1169 pipelined_receive(&wake_q, info);
1159 spin_unlock(&info->lock); 1170 spin_unlock(&info->lock);
1171 wake_up_q(&wake_q);
1160 ret = 0; 1172 ret = 0;
1161 } 1173 }
1162 if (ret == 0) { 1174 if (ret == 0) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..0bb88b555550 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,10 +1091,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
1091{ 1091{
1092 unsigned long cpu_limit; 1092 unsigned long cpu_limit;
1093 1093
1094 /* Thread group counters. */ 1094 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1095 thread_group_cputime_init(sig);
1096
1097 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1098 if (cpu_limit != RLIM_INFINITY) { 1095 if (cpu_limit != RLIM_INFINITY) {
1099 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); 1096 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
1100 sig->cputimer.running = 1; 1097 sig->cputimer.running = 1;
@@ -1396,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1396 p->hardirq_context = 0; 1393 p->hardirq_context = 0;
1397 p->softirq_context = 0; 1394 p->softirq_context = 0;
1398#endif 1395#endif
1396
1397 p->pagefault_disabled = 0;
1398
1399#ifdef CONFIG_LOCKDEP 1399#ifdef CONFIG_LOCKDEP
1400 p->lockdep_depth = 0; /* no locks held yet */ 1400 p->lockdep_depth = 0; /* no locks held yet */
1401 p->curr_chain_key = 0; 1401 p->curr_chain_key = 0;
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..f9984c363e9a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
1090 1090
1091/* 1091/*
1092 * The hash bucket lock must be held when this is called. 1092 * The hash bucket lock must be held when this is called.
1093 * Afterwards, the futex_q must not be accessed. 1093 * Afterwards, the futex_q must not be accessed. Callers
1094 * must ensure to later call wake_up_q() for the actual
1095 * wakeups to occur.
1094 */ 1096 */
1095static void wake_futex(struct futex_q *q) 1097static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1096{ 1098{
1097 struct task_struct *p = q->task; 1099 struct task_struct *p = q->task;
1098 1100
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
1100 return; 1102 return;
1101 1103
1102 /* 1104 /*
1103 * We set q->lock_ptr = NULL _before_ we wake up the task. If 1105 * Queue the task for later wakeup for after we've released
1104 * a non-futex wake up happens on another CPU then the task 1106 * the hb->lock. wake_q_add() grabs reference to p.
1105 * might exit and p would dereference a non-existing task
1106 * struct. Prevent this by holding a reference on p across the
1107 * wake up.
1108 */ 1107 */
1109 get_task_struct(p); 1108 wake_q_add(wake_q, p);
1110
1111 __unqueue_futex(q); 1109 __unqueue_futex(q);
1112 /* 1110 /*
1113 * The waiting task can free the futex_q as soon as 1111 * The waiting task can free the futex_q as soon as
@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q)
1117 */ 1115 */
1118 smp_wmb(); 1116 smp_wmb();
1119 q->lock_ptr = NULL; 1117 q->lock_ptr = NULL;
1120
1121 wake_up_state(p, TASK_NORMAL);
1122 put_task_struct(p);
1123} 1118}
1124 1119
1125static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 1120static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1217 struct futex_q *this, *next; 1212 struct futex_q *this, *next;
1218 union futex_key key = FUTEX_KEY_INIT; 1213 union futex_key key = FUTEX_KEY_INIT;
1219 int ret; 1214 int ret;
1215 WAKE_Q(wake_q);
1220 1216
1221 if (!bitset) 1217 if (!bitset)
1222 return -EINVAL; 1218 return -EINVAL;
@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1244 if (!(this->bitset & bitset)) 1240 if (!(this->bitset & bitset))
1245 continue; 1241 continue;
1246 1242
1247 wake_futex(this); 1243 mark_wake_futex(&wake_q, this);
1248 if (++ret >= nr_wake) 1244 if (++ret >= nr_wake)
1249 break; 1245 break;
1250 } 1246 }
1251 } 1247 }
1252 1248
1253 spin_unlock(&hb->lock); 1249 spin_unlock(&hb->lock);
1250 wake_up_q(&wake_q);
1254out_put_key: 1251out_put_key:
1255 put_futex_key(&key); 1252 put_futex_key(&key);
1256out: 1253out:
@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1269 struct futex_hash_bucket *hb1, *hb2; 1266 struct futex_hash_bucket *hb1, *hb2;
1270 struct futex_q *this, *next; 1267 struct futex_q *this, *next;
1271 int ret, op_ret; 1268 int ret, op_ret;
1269 WAKE_Q(wake_q);
1272 1270
1273retry: 1271retry:
1274 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1272 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1318,7 @@ retry_private:
1320 ret = -EINVAL; 1318 ret = -EINVAL;
1321 goto out_unlock; 1319 goto out_unlock;
1322 } 1320 }
1323 wake_futex(this); 1321 mark_wake_futex(&wake_q, this);
1324 if (++ret >= nr_wake) 1322 if (++ret >= nr_wake)
1325 break; 1323 break;
1326 } 1324 }
@@ -1334,7 +1332,7 @@ retry_private:
1334 ret = -EINVAL; 1332 ret = -EINVAL;
1335 goto out_unlock; 1333 goto out_unlock;
1336 } 1334 }
1337 wake_futex(this); 1335 mark_wake_futex(&wake_q, this);
1338 if (++op_ret >= nr_wake2) 1336 if (++op_ret >= nr_wake2)
1339 break; 1337 break;
1340 } 1338 }
@@ -1344,6 +1342,7 @@ retry_private:
1344 1342
1345out_unlock: 1343out_unlock:
1346 double_unlock_hb(hb1, hb2); 1344 double_unlock_hb(hb1, hb2);
1345 wake_up_q(&wake_q);
1347out_put_keys: 1346out_put_keys:
1348 put_futex_key(&key2); 1347 put_futex_key(&key2);
1349out_put_key1: 1348out_put_key1:
@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1503 struct futex_pi_state *pi_state = NULL; 1502 struct futex_pi_state *pi_state = NULL;
1504 struct futex_hash_bucket *hb1, *hb2; 1503 struct futex_hash_bucket *hb1, *hb2;
1505 struct futex_q *this, *next; 1504 struct futex_q *this, *next;
1505 WAKE_Q(wake_q);
1506 1506
1507 if (requeue_pi) { 1507 if (requeue_pi) {
1508 /* 1508 /*
@@ -1679,7 +1679,7 @@ retry_private:
1679 * woken by futex_unlock_pi(). 1679 * woken by futex_unlock_pi().
1680 */ 1680 */
1681 if (++task_count <= nr_wake && !requeue_pi) { 1681 if (++task_count <= nr_wake && !requeue_pi) {
1682 wake_futex(this); 1682 mark_wake_futex(&wake_q, this);
1683 continue; 1683 continue;
1684 } 1684 }
1685 1685
@@ -1719,6 +1719,7 @@ retry_private:
1719out_unlock: 1719out_unlock:
1720 free_pi_state(pi_state); 1720 free_pi_state(pi_state);
1721 double_unlock_hb(hb1, hb2); 1721 double_unlock_hb(hb1, hb2);
1722 wake_up_q(&wake_q);
1722 hb_waiters_dec(hb2); 1723 hb_waiters_dec(hb2);
1723 1724
1724 /* 1725 /*
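
The futex conversion above is the wake-queue pattern again: mark_wake_futex() only queues the task (wake_q_add() takes the reference that wake_futex() used to take with get_task_struct()), and the wakeups run after hb->lock is released. Roughly, the futex_wake() flow after this change reduces to the following condensed sketch (key lookup and error handling elided):

	struct futex_q *this, *next;
	int ret = 0;
	WAKE_Q(wake_q);

	spin_lock(&hb->lock);
	plist_for_each_entry_safe(this, next, &hb->chain, list) {
		if (!match_futex(&this->key, &key) || !(this->bitset & bitset))
			continue;
		mark_wake_futex(&wake_q, this);
		if (++ret >= nr_wake)
			break;
	}
	spin_unlock(&hb->lock);

	wake_up_q(&wake_q);			/* wakeups happen without hb->lock */
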
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be87024875..67687973ce80 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o 14obj-y += core.o loadavg.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16obj-y += wait.o completion.o idle.o 16obj-y += wait.o completion.o idle.o
17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd669d..750ed601ddf7 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include "sched.h" 1#include "sched.h"
4 2
5#include <linux/proc_fs.h> 3#include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
141 139
142 p->signal->autogroup = autogroup_kref_get(ag); 140 p->signal->autogroup = autogroup_kref_get(ag);
143 141
144 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) 142 if (!READ_ONCE(sysctl_sched_autogroup_enabled))
145 goto out; 143 goto out;
146 144
147 for_each_thread(p, t) 145 for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
249 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 247 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
250} 248}
251#endif /* CONFIG_SCHED_DEBUG */ 249#endif /* CONFIG_SCHED_DEBUG */
252
253#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
29static inline struct task_group * 29static inline struct task_group *
30autogroup_task_group(struct task_struct *p, struct task_group *tg) 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{ 31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 32 int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
33 33
34 if (enabled && task_wants_autogroup(p, tg)) 34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg; 35 return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 123673291ffb..20b858f2db22 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
511static bool set_nr_if_polling(struct task_struct *p) 511static bool set_nr_if_polling(struct task_struct *p)
512{ 512{
513 struct thread_info *ti = task_thread_info(p); 513 struct thread_info *ti = task_thread_info(p);
514 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); 514 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
515 515
516 for (;;) { 516 for (;;) {
517 if (!(val & _TIF_POLLING_NRFLAG)) 517 if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p)
541#endif 541#endif
542#endif 542#endif
543 543
544void wake_q_add(struct wake_q_head *head, struct task_struct *task)
545{
546 struct wake_q_node *node = &task->wake_q;
547
548 /*
549 * Atomically grab the task, if ->wake_q is !nil already it means
 550 * it's already queued (either by us or someone else) and will get the
551 * wakeup due to that.
552 *
553 * This cmpxchg() implies a full barrier, which pairs with the write
 554 * barrier implied by the wakeup in wake_up_q().
555 */
556 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
557 return;
558
559 get_task_struct(task);
560
561 /*
562 * The head is context local, there can be no concurrency.
563 */
564 *head->lastp = node;
565 head->lastp = &node->next;
566}
567
568void wake_up_q(struct wake_q_head *head)
569{
570 struct wake_q_node *node = head->first;
571
572 while (node != WAKE_Q_TAIL) {
573 struct task_struct *task;
574
575 task = container_of(node, struct task_struct, wake_q);
576 BUG_ON(!task);
577 /* task can safely be re-inserted now */
578 node = node->next;
579 task->wake_q.next = NULL;
580
581 /*
582 * wake_up_process() implies a wmb() to pair with the queueing
583 * in wake_q_add() so as not to miss wakeups.
584 */
585 wake_up_process(task);
586 put_task_struct(task);
587 }
588}
589
544/* 590/*
545 * resched_curr - mark rq's current task 'to be rescheduled now'. 591 * resched_curr - mark rq's current task 'to be rescheduled now'.
546 * 592 *
@@ -2397,9 +2443,9 @@ unsigned long nr_iowait_cpu(int cpu)
2397 2443
2398void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) 2444void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2399{ 2445{
2400 struct rq *this = this_rq(); 2446 struct rq *rq = this_rq();
2401 *nr_waiters = atomic_read(&this->nr_iowait); 2447 *nr_waiters = atomic_read(&rq->nr_iowait);
2402 *load = this->cpu_load[0]; 2448 *load = rq->load.weight;
2403} 2449}
2404 2450
2405#ifdef CONFIG_SMP 2451#ifdef CONFIG_SMP
@@ -2497,6 +2543,7 @@ void scheduler_tick(void)
2497 update_rq_clock(rq); 2543 update_rq_clock(rq);
2498 curr->sched_class->task_tick(rq, curr, 0); 2544 curr->sched_class->task_tick(rq, curr, 0);
2499 update_cpu_load_active(rq); 2545 update_cpu_load_active(rq);
2546 calc_global_load_tick(rq);
2500 raw_spin_unlock(&rq->lock); 2547 raw_spin_unlock(&rq->lock);
2501 2548
2502 perf_event_task_tick(); 2549 perf_event_task_tick();
@@ -2525,7 +2572,7 @@ void scheduler_tick(void)
2525u64 scheduler_tick_max_deferment(void) 2572u64 scheduler_tick_max_deferment(void)
2526{ 2573{
2527 struct rq *rq = this_rq(); 2574 struct rq *rq = this_rq();
2528 unsigned long next, now = ACCESS_ONCE(jiffies); 2575 unsigned long next, now = READ_ONCE(jiffies);
2529 2576
2530 next = rq->last_sched_tick + HZ; 2577 next = rq->last_sched_tick + HZ;
2531 2578
@@ -2726,9 +2773,7 @@ again:
2726 * - return from syscall or exception to user-space 2773 * - return from syscall or exception to user-space
2727 * - return from interrupt-handler to user-space 2774 * - return from interrupt-handler to user-space
2728 * 2775 *
2729 * WARNING: all callers must re-check need_resched() afterward and reschedule 2776 * WARNING: must be called with preemption disabled!
2730 * accordingly in case an event triggered the need for rescheduling (such as
2731 * an interrupt waking up a task) while preemption was disabled in __schedule().
2732 */ 2777 */
2733static void __sched __schedule(void) 2778static void __sched __schedule(void)
2734{ 2779{
@@ -2737,7 +2782,6 @@ static void __sched __schedule(void)
2737 struct rq *rq; 2782 struct rq *rq;
2738 int cpu; 2783 int cpu;
2739 2784
2740 preempt_disable();
2741 cpu = smp_processor_id(); 2785 cpu = smp_processor_id();
2742 rq = cpu_rq(cpu); 2786 rq = cpu_rq(cpu);
2743 rcu_note_context_switch(); 2787 rcu_note_context_switch();
@@ -2801,8 +2845,6 @@ static void __sched __schedule(void)
2801 raw_spin_unlock_irq(&rq->lock); 2845 raw_spin_unlock_irq(&rq->lock);
2802 2846
2803 post_schedule(rq); 2847 post_schedule(rq);
2804
2805 sched_preempt_enable_no_resched();
2806} 2848}
2807 2849
2808static inline void sched_submit_work(struct task_struct *tsk) 2850static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +2865,9 @@ asmlinkage __visible void __sched schedule(void)
2823 2865
2824 sched_submit_work(tsk); 2866 sched_submit_work(tsk);
2825 do { 2867 do {
2868 preempt_disable();
2826 __schedule(); 2869 __schedule();
2870 sched_preempt_enable_no_resched();
2827 } while (need_resched()); 2871 } while (need_resched());
2828} 2872}
2829EXPORT_SYMBOL(schedule); 2873EXPORT_SYMBOL(schedule);
@@ -2862,15 +2906,14 @@ void __sched schedule_preempt_disabled(void)
2862static void __sched notrace preempt_schedule_common(void) 2906static void __sched notrace preempt_schedule_common(void)
2863{ 2907{
2864 do { 2908 do {
2865 __preempt_count_add(PREEMPT_ACTIVE); 2909 preempt_active_enter();
2866 __schedule(); 2910 __schedule();
2867 __preempt_count_sub(PREEMPT_ACTIVE); 2911 preempt_active_exit();
2868 2912
2869 /* 2913 /*
2870 * Check again in case we missed a preemption opportunity 2914 * Check again in case we missed a preemption opportunity
2871 * between schedule and now. 2915 * between schedule and now.
2872 */ 2916 */
2873 barrier();
2874 } while (need_resched()); 2917 } while (need_resched());
2875} 2918}
2876 2919
@@ -2917,7 +2960,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2917 return; 2960 return;
2918 2961
2919 do { 2962 do {
2920 __preempt_count_add(PREEMPT_ACTIVE); 2963 preempt_active_enter();
2921 /* 2964 /*
2922 * Needs preempt disabled in case user_exit() is traced 2965 * Needs preempt disabled in case user_exit() is traced
2923 * and the tracer calls preempt_enable_notrace() causing 2966 * and the tracer calls preempt_enable_notrace() causing
@@ -2927,8 +2970,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2927 __schedule(); 2970 __schedule();
2928 exception_exit(prev_ctx); 2971 exception_exit(prev_ctx);
2929 2972
2930 __preempt_count_sub(PREEMPT_ACTIVE); 2973 preempt_active_exit();
2931 barrier();
2932 } while (need_resched()); 2974 } while (need_resched());
2933} 2975}
2934EXPORT_SYMBOL_GPL(preempt_schedule_context); 2976EXPORT_SYMBOL_GPL(preempt_schedule_context);
@@ -2952,17 +2994,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
2952 prev_state = exception_enter(); 2994 prev_state = exception_enter();
2953 2995
2954 do { 2996 do {
2955 __preempt_count_add(PREEMPT_ACTIVE); 2997 preempt_active_enter();
2956 local_irq_enable(); 2998 local_irq_enable();
2957 __schedule(); 2999 __schedule();
2958 local_irq_disable(); 3000 local_irq_disable();
2959 __preempt_count_sub(PREEMPT_ACTIVE); 3001 preempt_active_exit();
2960
2961 /*
2962 * Check again in case we missed a preemption opportunity
2963 * between schedule and now.
2964 */
2965 barrier();
2966 } while (need_resched()); 3002 } while (need_resched());
2967 3003
2968 exception_exit(prev_state); 3004 exception_exit(prev_state);
@@ -5314,7 +5350,7 @@ static struct notifier_block migration_notifier = {
5314 .priority = CPU_PRI_MIGRATION, 5350 .priority = CPU_PRI_MIGRATION,
5315}; 5351};
5316 5352
5317static void __cpuinit set_cpu_rq_start_time(void) 5353static void set_cpu_rq_start_time(void)
5318{ 5354{
5319 int cpu = smp_processor_id(); 5355 int cpu = smp_processor_id();
5320 struct rq *rq = cpu_rq(cpu); 5356 struct rq *rq = cpu_rq(cpu);
@@ -7734,11 +7770,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
7734 return rt_runtime_us; 7770 return rt_runtime_us;
7735} 7771}
7736 7772
7737static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7773static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
7738{ 7774{
7739 u64 rt_runtime, rt_period; 7775 u64 rt_runtime, rt_period;
7740 7776
7741 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7777 rt_period = rt_period_us * NSEC_PER_USEC;
7742 rt_runtime = tg->rt_bandwidth.rt_runtime; 7778 rt_runtime = tg->rt_bandwidth.rt_runtime;
7743 7779
7744 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7780 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..f5a64ffad176 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
567{ 567{
568 cputime_t old; 568 cputime_t old;
569 569
570 while (new > (old = ACCESS_ONCE(*counter))) 570 while (new > (old = READ_ONCE(*counter)))
571 cmpxchg_cputime(counter, old, new); 571 cmpxchg_cputime(counter, old, new);
572} 572}
573 573
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145088fd..890ce951c717 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
995 rq = cpu_rq(cpu); 995 rq = cpu_rq(cpu);
996 996
997 rcu_read_lock(); 997 rcu_read_lock();
998 curr = ACCESS_ONCE(rq->curr); /* unlocked access */ 998 curr = READ_ONCE(rq->curr); /* unlocked access */
999 999
1000 /* 1000 /*
1001 * If we are dealing with a -deadline task, we must 1001 * If we are dealing with a -deadline task, we must
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa4105e48..0d4632f7799b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
141 * 141 *
142 * This idea comes from the SD scheduler of Con Kolivas: 142 * This idea comes from the SD scheduler of Con Kolivas:
143 */ 143 */
144static int get_update_sysctl_factor(void) 144static unsigned int get_update_sysctl_factor(void)
145{ 145{
146 unsigned int cpus = min_t(int, num_online_cpus(), 8); 146 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
147 unsigned int factor; 147 unsigned int factor;
148 148
149 switch (sysctl_sched_tunable_scaling) { 149 switch (sysctl_sched_tunable_scaling) {
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
576 loff_t *ppos) 576 loff_t *ppos)
577{ 577{
578 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 578 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
579 int factor = get_update_sysctl_factor(); 579 unsigned int factor = get_update_sysctl_factor();
580 580
581 if (ret || !write) 581 if (ret || !write)
582 return ret; 582 return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
834 834
835static unsigned int task_scan_min(struct task_struct *p) 835static unsigned int task_scan_min(struct task_struct *p)
836{ 836{
837 unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); 837 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
838 unsigned int scan, floor; 838 unsigned int scan, floor;
839 unsigned int windows = 1; 839 unsigned int windows = 1;
840 840
@@ -1794,7 +1794,12 @@ static void task_numa_placement(struct task_struct *p)
1794 u64 runtime, period; 1794 u64 runtime, period;
1795 spinlock_t *group_lock = NULL; 1795 spinlock_t *group_lock = NULL;
1796 1796
1797 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1797 /*
1798 * The p->mm->numa_scan_seq field gets updated without
1799 * exclusive access. Use READ_ONCE() here to ensure
1800 * that the field is read in a single access:
1801 */
1802 seq = READ_ONCE(p->mm->numa_scan_seq);
1798 if (p->numa_scan_seq == seq) 1803 if (p->numa_scan_seq == seq)
1799 return; 1804 return;
1800 p->numa_scan_seq = seq; 1805 p->numa_scan_seq = seq;
@@ -1938,7 +1943,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1938 } 1943 }
1939 1944
1940 rcu_read_lock(); 1945 rcu_read_lock();
1941 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); 1946 tsk = READ_ONCE(cpu_rq(cpu)->curr);
1942 1947
1943 if (!cpupid_match_pid(tsk, cpupid)) 1948 if (!cpupid_match_pid(tsk, cpupid))
1944 goto no_join; 1949 goto no_join;
@@ -2107,7 +2112,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2107 2112
2108static void reset_ptenuma_scan(struct task_struct *p) 2113static void reset_ptenuma_scan(struct task_struct *p)
2109{ 2114{
2110 ACCESS_ONCE(p->mm->numa_scan_seq)++; 2115 /*
2116 * We only did a read acquisition of the mmap sem, so
2117 * p->mm->numa_scan_seq is written to without exclusive access
2118 * and the update is not guaranteed to be atomic. That's not
2119 * much of an issue though, since this is just used for
2120 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2121 * expensive, to avoid any form of compiler optimizations:
2122 */
2123 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2111 p->mm->numa_scan_offset = 0; 2124 p->mm->numa_scan_offset = 0;
2112} 2125}
2113 2126
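
Because ACCESS_ONCE(x)++ is going away, the increment above is spelled out as a read followed by a write. A small userspace sketch of the same pattern (the demo_ macros are assumptions standing in for the kernel's READ_ONCE/WRITE_ONCE), showing that the increment stays non-atomic and merely avoids torn or compiler-merged accesses:

	#include <stdio.h>

	#define demo_READ_ONCE(x)	(*(const volatile __typeof__(x) *)&(x))
	#define demo_WRITE_ONCE(x, v)	(*(volatile __typeof__(x) *)&(x) = (v))

	static int scan_seq;	/* updated without exclusive access, like numa_scan_seq */

	int main(void)
	{
		/*
		 * Read the current value once, then store the incremented value
		 * once. Two racing writers can still lose an increment; that is
		 * acceptable here because the sequence number is only statistical.
		 */
		demo_WRITE_ONCE(scan_seq, demo_READ_ONCE(scan_seq) + 1);

		printf("scan_seq = %d\n", scan_seq);
		return 0;
	}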
@@ -4323,6 +4336,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4323} 4336}
4324 4337
4325#ifdef CONFIG_SMP 4338#ifdef CONFIG_SMP
4339
4340/*
 4341 * per rq 'load' array crap; XXX kill this.
4342 */
4343
4344/*
4345 * The exact cpuload at various idx values, calculated at every tick would be
4346 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
4347 *
4348 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
4349 * on nth tick when cpu may be busy, then we have:
4350 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 4351 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
4352 *
4353 * decay_load_missed() below does efficient calculation of
4354 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4355 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
4356 *
4357 * The calculation is approximated on a 128 point scale.
4358 * degrade_zero_ticks is the number of ticks after which load at any
4359 * particular idx is approximated to be zero.
4360 * degrade_factor is a precomputed table, a row for each load idx.
4361 * Each column corresponds to degradation factor for a power of two ticks,
4362 * based on 128 point scale.
4363 * Example:
4364 * row 2, col 3 (=12) says that the degradation at load idx 2 after
4365 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
4366 *
4367 * With this power of 2 load factors, we can degrade the load n times
4368 * by looking at 1 bits in n and doing as many mult/shift instead of
4369 * n mult/shifts needed by the exact degradation.
4370 */
4371#define DEGRADE_SHIFT 7
4372static const unsigned char
4373 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4374static const unsigned char
4375 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4376 {0, 0, 0, 0, 0, 0, 0, 0},
4377 {64, 32, 8, 0, 0, 0, 0, 0},
4378 {96, 72, 40, 12, 1, 0, 0},
4379 {112, 98, 75, 43, 15, 1, 0},
4380 {120, 112, 98, 76, 45, 16, 2} };
4381
4382/*
4383 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4384 * would be when CPU is idle and so we just decay the old load without
4385 * adding any new load.
4386 */
4387static unsigned long
4388decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4389{
4390 int j = 0;
4391
4392 if (!missed_updates)
4393 return load;
4394
4395 if (missed_updates >= degrade_zero_ticks[idx])
4396 return 0;
4397
4398 if (idx == 1)
4399 return load >> missed_updates;
4400
4401 while (missed_updates) {
4402 if (missed_updates % 2)
4403 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4404
4405 missed_updates >>= 1;
4406 j++;
4407 }
4408 return load;
4409}
4410
4411/*
4412 * Update rq->cpu_load[] statistics. This function is usually called every
4413 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
4414 * every tick. We fix it up based on jiffies.
4415 */
4416static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4417 unsigned long pending_updates)
4418{
4419 int i, scale;
4420
4421 this_rq->nr_load_updates++;
4422
4423 /* Update our load: */
4424 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4425 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4426 unsigned long old_load, new_load;
4427
4428 /* scale is effectively 1 << i now, and >> i divides by scale */
4429
4430 old_load = this_rq->cpu_load[i];
4431 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4432 new_load = this_load;
4433 /*
4434 * Round up the averaging division if load is increasing. This
4435 * prevents us from getting stuck on 9 if the load is 10, for
4436 * example.
4437 */
4438 if (new_load > old_load)
4439 new_load += scale - 1;
4440
4441 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4442 }
4443
4444 sched_avg_update(this_rq);
4445}
4446
4447#ifdef CONFIG_NO_HZ_COMMON
4448/*
4449 * There is no sane way to deal with nohz on smp when using jiffies because the
4450 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4451 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4452 *
4453 * Therefore we cannot use the delta approach from the regular tick since that
4454 * would seriously skew the load calculation. However we'll make do for those
4455 * updates happening while idle (nohz_idle_balance) or coming out of idle
4456 * (tick_nohz_idle_exit).
4457 *
4458 * This means we might still be one tick off for nohz periods.
4459 */
4460
4461/*
4462 * Called from nohz_idle_balance() to update the load ratings before doing the
4463 * idle balance.
4464 */
4465static void update_idle_cpu_load(struct rq *this_rq)
4466{
4467 unsigned long curr_jiffies = READ_ONCE(jiffies);
4468 unsigned long load = this_rq->cfs.runnable_load_avg;
4469 unsigned long pending_updates;
4470
4471 /*
4472 * bail if there's load or we're actually up-to-date.
4473 */
4474 if (load || curr_jiffies == this_rq->last_load_update_tick)
4475 return;
4476
4477 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4478 this_rq->last_load_update_tick = curr_jiffies;
4479
4480 __update_cpu_load(this_rq, load, pending_updates);
4481}
4482
4483/*
4484 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
4485 */
4486void update_cpu_load_nohz(void)
4487{
4488 struct rq *this_rq = this_rq();
4489 unsigned long curr_jiffies = READ_ONCE(jiffies);
4490 unsigned long pending_updates;
4491
4492 if (curr_jiffies == this_rq->last_load_update_tick)
4493 return;
4494
4495 raw_spin_lock(&this_rq->lock);
4496 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4497 if (pending_updates) {
4498 this_rq->last_load_update_tick = curr_jiffies;
4499 /*
4500 * We were idle, this means load 0, the current load might be
4501 * !0 due to remote wakeups and the sort.
4502 */
4503 __update_cpu_load(this_rq, 0, pending_updates);
4504 }
4505 raw_spin_unlock(&this_rq->lock);
4506}
4507#endif /* CONFIG_NO_HZ */
4508
4509/*
4510 * Called from scheduler_tick()
4511 */
4512void update_cpu_load_active(struct rq *this_rq)
4513{
4514 unsigned long load = this_rq->cfs.runnable_load_avg;
4515 /*
4516 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4517 */
4518 this_rq->last_load_update_tick = jiffies;
4519 __update_cpu_load(this_rq, load, 1);
4520}
4521
4326/* Used instead of source_load when we know the type == 0 */ 4522/* Used instead of source_load when we know the type == 0 */
4327static unsigned long weighted_cpuload(const int cpu) 4523static unsigned long weighted_cpuload(const int cpu)
4328{ 4524{
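
The cpu_load[] machinery moved into fair.c above decays stale per-index load with a precomputed 128-point table instead of looping once per missed tick. The following standalone sketch reuses the table and shift values from the hunk; the surrounding program and its sample values are only an illustration of the arithmetic:

	#include <stdio.h>

	#define CPU_LOAD_IDX_MAX	5
	#define DEGRADE_SHIFT		7

	static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
		{ 0, 8, 32, 64, 128 };
	static const unsigned char degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
		{   0,   0,  0,  0,  0,  0, 0, 0 },
		{  64,  32,  8,  0,  0,  0, 0, 0 },
		{  96,  72, 40, 12,  1,  0, 0 },
		{ 112,  98, 75, 43, 15,  1, 0 },
		{ 120, 112, 98, 76, 45, 16, 2 } };

	/* Same logic as decay_load_missed(): walk the set bits of missed_updates. */
	static unsigned long decay_load_missed(unsigned long load,
					       unsigned long missed_updates, int idx)
	{
		int j = 0;

		if (!missed_updates)
			return load;
		if (missed_updates >= degrade_zero_ticks[idx])
			return 0;
		if (idx == 1)
			return load >> missed_updates;

		while (missed_updates) {
			if (missed_updates % 2)
				load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
			missed_updates >>= 1;
			j++;
		}
		return load;
	}

	int main(void)
	{
		/* Decay a load of 1024 at idx 2 after 8 missed ticks: 1024 * 12/128 = 96. */
		printf("decayed = %lu\n", decay_load_missed(1024, 8, 2));
		return 0;
	}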
@@ -4375,7 +4571,7 @@ static unsigned long capacity_orig_of(int cpu)
4375static unsigned long cpu_avg_load_per_task(int cpu) 4571static unsigned long cpu_avg_load_per_task(int cpu)
4376{ 4572{
4377 struct rq *rq = cpu_rq(cpu); 4573 struct rq *rq = cpu_rq(cpu);
4378 unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); 4574 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4379 unsigned long load_avg = rq->cfs.runnable_load_avg; 4575 unsigned long load_avg = rq->cfs.runnable_load_avg;
4380 4576
4381 if (nr_running) 4577 if (nr_running)
@@ -5467,10 +5663,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
5467} 5663}
5468 5664
5469#ifdef CONFIG_NUMA_BALANCING 5665#ifdef CONFIG_NUMA_BALANCING
5470/* Returns true if the destination node has incurred more faults */ 5666/*
5667 * Returns true if the destination node is the preferred node.
5668 * Needs to match fbq_classify_rq(): if there is a runnable task
5669 * that is not on its preferred node, we should identify it.
5670 */
5471static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5671static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5472{ 5672{
5473 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5673 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5674 unsigned long src_faults, dst_faults;
5474 int src_nid, dst_nid; 5675 int src_nid, dst_nid;
5475 5676
5476 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5677 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5484,29 +5685,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5484 if (src_nid == dst_nid) 5685 if (src_nid == dst_nid)
5485 return false; 5686 return false;
5486 5687
5487 if (numa_group) {
5488 /* Task is already in the group's interleave set. */
5489 if (node_isset(src_nid, numa_group->active_nodes))
5490 return false;
5491
5492 /* Task is moving into the group's interleave set. */
5493 if (node_isset(dst_nid, numa_group->active_nodes))
5494 return true;
5495
5496 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5497 }
5498
5499 /* Encourage migration to the preferred node. */ 5688 /* Encourage migration to the preferred node. */
5500 if (dst_nid == p->numa_preferred_nid) 5689 if (dst_nid == p->numa_preferred_nid)
5501 return true; 5690 return true;
5502 5691
5503 return task_faults(p, dst_nid) > task_faults(p, src_nid); 5692 /* Migrating away from the preferred node is bad. */
5693 if (src_nid == p->numa_preferred_nid)
5694 return false;
5695
5696 if (numa_group) {
5697 src_faults = group_faults(p, src_nid);
5698 dst_faults = group_faults(p, dst_nid);
5699 } else {
5700 src_faults = task_faults(p, src_nid);
5701 dst_faults = task_faults(p, dst_nid);
5702 }
5703
5704 return dst_faults > src_faults;
5504} 5705}
5505 5706
5506 5707
5507static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5708static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5508{ 5709{
5509 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5710 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5711 unsigned long src_faults, dst_faults;
5510 int src_nid, dst_nid; 5712 int src_nid, dst_nid;
5511 5713
5512 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5714 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5521,23 +5723,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5521 if (src_nid == dst_nid) 5723 if (src_nid == dst_nid)
5522 return false; 5724 return false;
5523 5725
5524 if (numa_group) { 5726 /* Migrating away from the preferred node is bad. */
5525 /* Task is moving within/into the group's interleave set. */ 5727 if (src_nid == p->numa_preferred_nid)
5526 if (node_isset(dst_nid, numa_group->active_nodes)) 5728 return true;
5527 return false;
5528 5729
5529 /* Task is moving out of the group's interleave set. */ 5730 /* Encourage migration to the preferred node. */
5530 if (node_isset(src_nid, numa_group->active_nodes)) 5731 if (dst_nid == p->numa_preferred_nid)
5531 return true; 5732 return false;
5532 5733
5533 return group_faults(p, dst_nid) < group_faults(p, src_nid); 5734 if (numa_group) {
5735 src_faults = group_faults(p, src_nid);
5736 dst_faults = group_faults(p, dst_nid);
5737 } else {
5738 src_faults = task_faults(p, src_nid);
5739 dst_faults = task_faults(p, dst_nid);
5534 } 5740 }
5535 5741
5536 /* Migrating away from the preferred node is always bad. */ 5742 return dst_faults < src_faults;
5537 if (src_nid == p->numa_preferred_nid)
5538 return true;
5539
5540 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5541} 5743}
5542 5744
5543#else 5745#else
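
After this rewrite, migrate_improves_locality() and migrate_degrades_locality() are symmetric: the preferred node is checked in both directions first, and only then are per-node fault counts compared (group faults when the task belongs to a NUMA group, task faults otherwise). A condensed userspace sketch of that decision follows; the demo_task struct and its fault arrays are hypothetical stand-ins for the kernel's group_faults()/task_faults() accessors:

	#include <stdbool.h>
	#include <stdio.h>

	#define NR_NODES 2

	struct demo_task {
		int  numa_preferred_nid;
		bool has_numa_group;
		unsigned long group_faults[NR_NODES];	/* stand-in for group_faults() */
		unsigned long task_faults[NR_NODES];	/* stand-in for task_faults() */
	};

	static bool migrate_improves_locality(struct demo_task *p, int src_nid, int dst_nid)
	{
		unsigned long src_faults, dst_faults;

		if (src_nid == dst_nid)
			return false;
		if (dst_nid == p->numa_preferred_nid)	/* moving to preferred node: good */
			return true;
		if (src_nid == p->numa_preferred_nid)	/* leaving preferred node: bad */
			return false;

		if (p->has_numa_group) {
			src_faults = p->group_faults[src_nid];
			dst_faults = p->group_faults[dst_nid];
		} else {
			src_faults = p->task_faults[src_nid];
			dst_faults = p->task_faults[dst_nid];
		}
		return dst_faults > src_faults;
	}

	int main(void)
	{
		struct demo_task p = {
			.numa_preferred_nid = 1,
			.has_numa_group = false,
			.task_faults = { 10, 40 },
		};

		printf("0 -> 1 improves locality: %d\n", migrate_improves_locality(&p, 0, 1));
		printf("1 -> 0 improves locality: %d\n", migrate_improves_locality(&p, 1, 0));
		return 0;
	}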
@@ -6037,8 +6239,8 @@ static unsigned long scale_rt_capacity(int cpu)
6037 * Since we're reading these variables without serialization make sure 6239 * Since we're reading these variables without serialization make sure
6038 * we read them once before doing sanity checks on them. 6240 * we read them once before doing sanity checks on them.
6039 */ 6241 */
6040 age_stamp = ACCESS_ONCE(rq->age_stamp); 6242 age_stamp = READ_ONCE(rq->age_stamp);
6041 avg = ACCESS_ONCE(rq->rt_avg); 6243 avg = READ_ONCE(rq->rt_avg);
6042 delta = __rq_clock_broken(rq) - age_stamp; 6244 delta = __rq_clock_broken(rq) - age_stamp;
6043 6245
6044 if (unlikely(delta < 0)) 6246 if (unlikely(delta < 0))
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 8ecd552fe4f2..ef7159012cf3 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
1/* 1/*
2 * kernel/sched/proc.c 2 * kernel/sched/loadavg.c
3 * 3 *
4 * Kernel load calculations, forked from sched/core.c 4 * This file contains the magic bits required to compute the global loadavg
 5 * figure. It's a silly number but people think it's important. We go through
6 * great pains to make it work on big machines and tickless kernels.
5 */ 7 */
6 8
7#include <linux/export.h> 9#include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
81 long nr_active, delta = 0; 83 long nr_active, delta = 0;
82 84
83 nr_active = this_rq->nr_running; 85 nr_active = this_rq->nr_running;
84 nr_active += (long) this_rq->nr_uninterruptible; 86 nr_active += (long)this_rq->nr_uninterruptible;
85 87
86 if (nr_active != this_rq->calc_load_active) { 88 if (nr_active != this_rq->calc_load_active) {
87 delta = nr_active - this_rq->calc_load_active; 89 delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
186 delta = calc_load_fold_active(this_rq); 188 delta = calc_load_fold_active(this_rq);
187 if (delta) { 189 if (delta) {
188 int idx = calc_load_write_idx(); 190 int idx = calc_load_write_idx();
191
189 atomic_long_add(delta, &calc_load_idle[idx]); 192 atomic_long_add(delta, &calc_load_idle[idx]);
190 } 193 }
191} 194}
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
241{ 244{
242 unsigned long result = 1UL << frac_bits; 245 unsigned long result = 1UL << frac_bits;
243 246
244 if (n) for (;;) { 247 if (n) {
245 if (n & 1) { 248 for (;;) {
246 result *= x; 249 if (n & 1) {
247 result += 1UL << (frac_bits - 1); 250 result *= x;
248 result >>= frac_bits; 251 result += 1UL << (frac_bits - 1);
252 result >>= frac_bits;
253 }
254 n >>= 1;
255 if (!n)
256 break;
257 x *= x;
258 x += 1UL << (frac_bits - 1);
259 x >>= frac_bits;
249 } 260 }
250 n >>= 1;
251 if (!n)
252 break;
253 x *= x;
254 x += 1UL << (frac_bits - 1);
255 x >>= frac_bits;
256 } 261 }
257 262
258 return result; 263 return result;
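
fixed_power_int() above computes x^n in fixed point by repeated squaring, rounding after every multiply; calc_load_n() uses it to fold n missed LOAD_FREQ periods into one update. A self-contained sketch with the kernel's 11-bit FSHIFT (only the demo values in main() are invented):

	#include <stdio.h>

	#define FSHIFT	11			/* bits of fractional precision */
	#define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed point */

	/* Fixed-point x^n by repeated squaring, rounding after every multiply. */
	static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
					     unsigned int n)
	{
		unsigned long result = 1UL << frac_bits;

		if (n) {
			for (;;) {
				if (n & 1) {
					result *= x;
					result += 1UL << (frac_bits - 1);
					result >>= frac_bits;
				}
				n >>= 1;
				if (!n)
					break;
				x *= x;
				x += 1UL << (frac_bits - 1);
				x >>= frac_bits;
			}
		}
		return result;
	}

	int main(void)
	{
		unsigned long half = FIXED_1 / 2;	/* 0.5 in fixed point */

		/* 0.5^3 = 0.125, i.e. FIXED_1 / 8 = 256 with FSHIFT = 11. */
		printf("0.5^3 in fixed point: %lu (expect %lu)\n",
		       fixed_power_int(half, FSHIFT, 3), FIXED_1 / 8);
		return 0;
	}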
@@ -285,7 +290,6 @@ static unsigned long
285calc_load_n(unsigned long load, unsigned long exp, 290calc_load_n(unsigned long load, unsigned long exp,
286 unsigned long active, unsigned int n) 291 unsigned long active, unsigned int n)
287{ 292{
288
289 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 293 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
290} 294}
291 295
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
339/* 343/*
340 * calc_load - update the avenrun load estimates 10 ticks after the 344 * calc_load - update the avenrun load estimates 10 ticks after the
341 * CPUs have updated calc_load_tasks. 345 * CPUs have updated calc_load_tasks.
346 *
347 * Called from the global timer code.
342 */ 348 */
343void calc_global_load(unsigned long ticks) 349void calc_global_load(unsigned long ticks)
344{ 350{
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
370} 376}
371 377
372/* 378/*
373 * Called from update_cpu_load() to periodically update this CPU's 379 * Called from scheduler_tick() to periodically update this CPU's
374 * active count. 380 * active count.
375 */ 381 */
376static void calc_load_account_active(struct rq *this_rq) 382void calc_global_load_tick(struct rq *this_rq)
377{ 383{
378 long delta; 384 long delta;
379 385
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
386 392
387 this_rq->calc_load_update += LOAD_FREQ; 393 this_rq->calc_load_update += LOAD_FREQ;
388} 394}
389
390/*
391 * End of global load-average stuff
392 */
393
394/*
395 * The exact cpuload at various idx values, calculated at every tick would be
396 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
397 *
398 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
399 * on nth tick when cpu may be busy, then we have:
400 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
401 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
402 *
403 * decay_load_missed() below does efficient calculation of
404 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
405 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
406 *
407 * The calculation is approximated on a 128 point scale.
408 * degrade_zero_ticks is the number of ticks after which load at any
409 * particular idx is approximated to be zero.
410 * degrade_factor is a precomputed table, a row for each load idx.
411 * Each column corresponds to degradation factor for a power of two ticks,
412 * based on 128 point scale.
413 * Example:
414 * row 2, col 3 (=12) says that the degradation at load idx 2 after
415 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
416 *
417 * With this power of 2 load factors, we can degrade the load n times
418 * by looking at 1 bits in n and doing as many mult/shift instead of
419 * n mult/shifts needed by the exact degradation.
420 */
421#define DEGRADE_SHIFT 7
422static const unsigned char
423 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
424static const unsigned char
425 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
426 {0, 0, 0, 0, 0, 0, 0, 0},
427 {64, 32, 8, 0, 0, 0, 0, 0},
428 {96, 72, 40, 12, 1, 0, 0},
429 {112, 98, 75, 43, 15, 1, 0},
430 {120, 112, 98, 76, 45, 16, 2} };
431
432/*
433 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
434 * would be when CPU is idle and so we just decay the old load without
435 * adding any new load.
436 */
437static unsigned long
438decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
439{
440 int j = 0;
441
442 if (!missed_updates)
443 return load;
444
445 if (missed_updates >= degrade_zero_ticks[idx])
446 return 0;
447
448 if (idx == 1)
449 return load >> missed_updates;
450
451 while (missed_updates) {
452 if (missed_updates % 2)
453 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
454
455 missed_updates >>= 1;
456 j++;
457 }
458 return load;
459}
460
461/*
462 * Update rq->cpu_load[] statistics. This function is usually called every
463 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
464 * every tick. We fix it up based on jiffies.
465 */
466static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
467 unsigned long pending_updates)
468{
469 int i, scale;
470
471 this_rq->nr_load_updates++;
472
473 /* Update our load: */
474 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
475 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
476 unsigned long old_load, new_load;
477
478 /* scale is effectively 1 << i now, and >> i divides by scale */
479
480 old_load = this_rq->cpu_load[i];
481 old_load = decay_load_missed(old_load, pending_updates - 1, i);
482 new_load = this_load;
483 /*
484 * Round up the averaging division if load is increasing. This
485 * prevents us from getting stuck on 9 if the load is 10, for
486 * example.
487 */
488 if (new_load > old_load)
489 new_load += scale - 1;
490
491 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
492 }
493
494 sched_avg_update(this_rq);
495}
496
497#ifdef CONFIG_SMP
498static inline unsigned long get_rq_runnable_load(struct rq *rq)
499{
500 return rq->cfs.runnable_load_avg;
501}
502#else
503static inline unsigned long get_rq_runnable_load(struct rq *rq)
504{
505 return rq->load.weight;
506}
507#endif
508
509#ifdef CONFIG_NO_HZ_COMMON
510/*
511 * There is no sane way to deal with nohz on smp when using jiffies because the
512 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
513 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
514 *
515 * Therefore we cannot use the delta approach from the regular tick since that
516 * would seriously skew the load calculation. However we'll make do for those
517 * updates happening while idle (nohz_idle_balance) or coming out of idle
518 * (tick_nohz_idle_exit).
519 *
520 * This means we might still be one tick off for nohz periods.
521 */
522
523/*
524 * Called from nohz_idle_balance() to update the load ratings before doing the
525 * idle balance.
526 */
527void update_idle_cpu_load(struct rq *this_rq)
528{
529 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
530 unsigned long load = get_rq_runnable_load(this_rq);
531 unsigned long pending_updates;
532
533 /*
534 * bail if there's load or we're actually up-to-date.
535 */
536 if (load || curr_jiffies == this_rq->last_load_update_tick)
537 return;
538
539 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
540 this_rq->last_load_update_tick = curr_jiffies;
541
542 __update_cpu_load(this_rq, load, pending_updates);
543}
544
545/*
546 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
547 */
548void update_cpu_load_nohz(void)
549{
550 struct rq *this_rq = this_rq();
551 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
552 unsigned long pending_updates;
553
554 if (curr_jiffies == this_rq->last_load_update_tick)
555 return;
556
557 raw_spin_lock(&this_rq->lock);
558 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
559 if (pending_updates) {
560 this_rq->last_load_update_tick = curr_jiffies;
561 /*
562 * We were idle, this means load 0, the current load might be
563 * !0 due to remote wakeups and the sort.
564 */
565 __update_cpu_load(this_rq, 0, pending_updates);
566 }
567 raw_spin_unlock(&this_rq->lock);
568}
569#endif /* CONFIG_NO_HZ */
570
571/*
572 * Called from scheduler_tick()
573 */
574void update_cpu_load_active(struct rq *this_rq)
575{
576 unsigned long load = get_rq_runnable_load(this_rq);
577 /*
578 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
579 */
580 this_rq->last_load_update_tick = jiffies;
581 __update_cpu_load(this_rq, load, 1);
582
583 calc_load_account_active(this_rq);
584}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3874..560d2fa623c3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1323 rq = cpu_rq(cpu); 1323 rq = cpu_rq(cpu);
1324 1324
1325 rcu_read_lock(); 1325 rcu_read_lock();
1326 curr = ACCESS_ONCE(rq->curr); /* unlocked access */ 1326 curr = READ_ONCE(rq->curr); /* unlocked access */
1327 1327
1328 /* 1328 /*
1329 * If the current task on @p's runqueue is an RT task, then 1329 * If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e129993958..d85455539d5c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
26extern unsigned long calc_load_update; 26extern unsigned long calc_load_update;
27extern atomic_long_t calc_load_tasks; 27extern atomic_long_t calc_load_tasks;
28 28
29extern void calc_global_load_tick(struct rq *this_rq);
29extern long calc_load_fold_active(struct rq *this_rq); 30extern long calc_load_fold_active(struct rq *this_rq);
31
32#ifdef CONFIG_SMP
30extern void update_cpu_load_active(struct rq *this_rq); 33extern void update_cpu_load_active(struct rq *this_rq);
34#else
35static inline void update_cpu_load_active(struct rq *this_rq) { }
36#endif
31 37
32/* 38/*
33 * Helpers for converting nanosecond timing to jiffy resolution 39 * Helpers for converting nanosecond timing to jiffy resolution
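
Making update_cpu_load_active() SMP-only in the header follows the usual kernel pattern of pairing an extern declaration with an empty static inline stub, so call sites stay unconditional. A generic sketch of the idiom with placeholder names (demo_rq and demo_update_cpu_load_active are assumptions, not kernel symbols):

	#include <stdio.h>

	struct demo_rq { int cpu; };

	/* #define CONFIG_SMP 1 */	/* define to select the real implementation */

	#ifdef CONFIG_SMP
	extern void demo_update_cpu_load_active(struct demo_rq *rq);	/* in a .c file */
	#else
	/* UP build: keep callers unconditional by providing an empty stub. */
	static inline void demo_update_cpu_load_active(struct demo_rq *rq) { (void)rq; }
	#endif

	int main(void)
	{
		struct demo_rq rq = { .cpu = 0 };

		demo_update_cpu_load_active(&rq);	/* compiles away when !CONFIG_SMP */
		printf("ok\n");
		return 0;
	}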
@@ -707,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
707 713
708static inline u64 __rq_clock_broken(struct rq *rq) 714static inline u64 __rq_clock_broken(struct rq *rq)
709{ 715{
710 return ACCESS_ONCE(rq->clock); 716 return READ_ONCE(rq->clock);
711} 717}
712 718
713static inline u64 rq_clock(struct rq *rq) 719static inline u64 rq_clock(struct rq *rq)
@@ -1298,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1298 1304
1299unsigned long to_ratio(u64 period, u64 runtime); 1305unsigned long to_ratio(u64 period, u64 runtime);
1300 1306
1301extern void update_idle_cpu_load(struct rq *this_rq);
1302
1303extern void init_task_runnable_average(struct task_struct *p); 1307extern void init_task_runnable_average(struct task_struct *p);
1304 1308
1305static inline void add_nr_running(struct rq *rq, unsigned count) 1309static inline void add_nr_running(struct rq *rq, unsigned count)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339656..077ebbd5e10f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
174{ 174{
175 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 175 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
176 176
177 if (!cputimer->running) 177 /* Check if cputimer isn't running. This is accessed without locking. */
178 if (!READ_ONCE(cputimer->running))
178 return false; 179 return false;
179 180
180 /* 181 /*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
215 if (!cputimer_running(tsk)) 216 if (!cputimer_running(tsk))
216 return; 217 return;
217 218
218 raw_spin_lock(&cputimer->lock); 219 atomic64_add(cputime, &cputimer->cputime_atomic.utime);
219 cputimer->cputime.utime += cputime;
220 raw_spin_unlock(&cputimer->lock);
221} 220}
222 221
223/** 222/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
238 if (!cputimer_running(tsk)) 237 if (!cputimer_running(tsk))
239 return; 238 return;
240 239
241 raw_spin_lock(&cputimer->lock); 240 atomic64_add(cputime, &cputimer->cputime_atomic.stime);
242 cputimer->cputime.stime += cputime;
243 raw_spin_unlock(&cputimer->lock);
244} 241}
245 242
246/** 243/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
261 if (!cputimer_running(tsk)) 258 if (!cputimer_running(tsk))
262 return; 259 return;
263 260
264 raw_spin_lock(&cputimer->lock); 261 atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
265 cputimer->cputime.sum_exec_runtime += ns;
266 raw_spin_unlock(&cputimer->lock);
267} 262}
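
The stats.h changes replace the cputimer spinlock with plain atomic64_add() on the new task_cputime_atomic fields, so the hot accounting paths no longer take a lock. A userspace sketch of the same idea using C11 atomics follows; the struct and function names are placeholders standing in for the kernel's atomic64_t fields and account_group_*_time() helpers:

	#include <inttypes.h>
	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Stand-in for struct task_cputime_atomic: each field is updated locklessly. */
	struct demo_cputime_atomic {
		_Atomic uint64_t utime;
		_Atomic uint64_t stime;
		_Atomic uint64_t sum_exec_runtime;
	};

	static struct demo_cputime_atomic group_time;

	/* Rough equivalent of account_group_user_time() minus the running check. */
	static void account_user_time(uint64_t cputime)
	{
		atomic_fetch_add_explicit(&group_time.utime, cputime,
					  memory_order_relaxed);
	}

	int main(void)
	{
		account_user_time(1000);
		account_user_time(500);

		printf("utime = %" PRIu64 "\n",
		       atomic_load_explicit(&group_time.utime, memory_order_relaxed));
		return 0;
	}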
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f36..2ccec988d6b7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
601 601
602__sched int bit_wait_timeout(struct wait_bit_key *word) 602__sched int bit_wait_timeout(struct wait_bit_key *word)
603{ 603{
604 unsigned long now = ACCESS_ONCE(jiffies); 604 unsigned long now = READ_ONCE(jiffies);
605 if (signal_pending_state(current->state, current)) 605 if (signal_pending_state(current->state, current))
606 return 1; 606 return 1;
607 if (time_after_eq(now, word->timeout)) 607 if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
613 613
614__sched int bit_wait_io_timeout(struct wait_bit_key *word) 614__sched int bit_wait_io_timeout(struct wait_bit_key *word)
615{ 615{
616 unsigned long now = ACCESS_ONCE(jiffies); 616 unsigned long now = READ_ONCE(jiffies);
617 if (signal_pending_state(current->state, current)) 617 if (signal_pending_state(current->state, current))
618 return 1; 618 return 1;
619 if (time_after_eq(now, word->timeout)) 619 if (time_after_eq(now, word->timeout))
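
The bit_wait_timeout() helpers above snapshot jiffies once with READ_ONCE() and then compare with time_after_eq(), which stays correct across jiffies wraparound by testing the signed difference. A small sketch of that comparison (a simplified form of the include/linux/jiffies.h macro, without its type checks; the demo values are invented):

	#include <stdio.h>

	/*
	 * Simplified time_after_eq(a, b): true if a is at or after b, even when
	 * the counter has wrapped, because the subtraction is interpreted signed.
	 */
	#define demo_time_after_eq(a, b)	((long)((a) - (b)) >= 0)

	int main(void)
	{
		unsigned long timeout = (unsigned long)-5;	/* just before wraparound */
		unsigned long now = 10;				/* just after wraparound */

		printf("now >= timeout: %d\n", demo_time_after_eq(now, timeout));
		return 0;
	}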
diff --git a/kernel/signal.c b/kernel/signal.c
index d51c5ddd855c..f19833b5db3c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
245 * RETURNS: 245 * RETURNS:
246 * %true if @mask is set, %false if made noop because @task was dying. 246 * %true if @mask is set, %false if made noop because @task was dying.
247 */ 247 */
248bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) 248bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
249{ 249{
250 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | 250 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
251 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); 251 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
297 * CONTEXT: 297 * CONTEXT:
298 * Must be called with @task->sighand->siglock held. 298 * Must be called with @task->sighand->siglock held.
299 */ 299 */
300void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) 300void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
301{ 301{
302 BUG_ON(mask & ~JOBCTL_PENDING_MASK); 302 BUG_ON(mask & ~JOBCTL_PENDING_MASK);
303 303
@@ -2000,7 +2000,7 @@ static bool do_signal_stop(int signr)
2000 struct signal_struct *sig = current->signal; 2000 struct signal_struct *sig = current->signal;
2001 2001
2002 if (!(current->jobctl & JOBCTL_STOP_PENDING)) { 2002 if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
2003 unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; 2003 unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
2004 struct task_struct *t; 2004 struct task_struct *t;
2005 2005
2006 /* signr will be recorded in task->jobctl for retries */ 2006 /* signr will be recorded in task->jobctl for retries */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da74abf0..892e3dae0aac 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
196 return 0; 196 return 0;
197} 197}
198 198
199static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 199/*
200 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
201 * to avoid race conditions with concurrent updates to cputime.
202 */
203static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
200{ 204{
201 if (b->utime > a->utime) 205 u64 curr_cputime;
202 a->utime = b->utime; 206retry:
207 curr_cputime = atomic64_read(cputime);
208 if (sum_cputime > curr_cputime) {
209 if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
210 goto retry;
211 }
212}
203 213
204 if (b->stime > a->stime) 214static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
205 a->stime = b->stime; 215{
216 __update_gt_cputime(&cputime_atomic->utime, sum->utime);
217 __update_gt_cputime(&cputime_atomic->stime, sum->stime);
218 __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
219}
206 220
207 if (b->sum_exec_runtime > a->sum_exec_runtime) 221/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
208 a->sum_exec_runtime = b->sum_exec_runtime; 222static inline void sample_cputime_atomic(struct task_cputime *times,
223 struct task_cputime_atomic *atomic_times)
224{
225 times->utime = atomic64_read(&atomic_times->utime);
226 times->stime = atomic64_read(&atomic_times->stime);
227 times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
209} 228}
210 229
211void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) 230void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
212{ 231{
213 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 232 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
214 struct task_cputime sum; 233 struct task_cputime sum;
215 unsigned long flags;
216 234
217 if (!cputimer->running) { 235 /* Check if cputimer isn't running. This is accessed without locking. */
236 if (!READ_ONCE(cputimer->running)) {
218 /* 237 /*
219 * The POSIX timer interface allows for absolute time expiry 238 * The POSIX timer interface allows for absolute time expiry
220 * values through the TIMER_ABSTIME flag, therefore we have 239 * values through the TIMER_ABSTIME flag, therefore we have
221 * to synchronize the timer to the clock every time we start 240 * to synchronize the timer to the clock every time we start it.
222 * it.
223 */ 241 */
224 thread_group_cputime(tsk, &sum); 242 thread_group_cputime(tsk, &sum);
225 raw_spin_lock_irqsave(&cputimer->lock, flags); 243 update_gt_cputime(&cputimer->cputime_atomic, &sum);
226 cputimer->running = 1; 244
227 update_gt_cputime(&cputimer->cputime, &sum); 245 /*
228 } else 246 * We're setting cputimer->running without a lock. Ensure
229 raw_spin_lock_irqsave(&cputimer->lock, flags); 247 * this only gets written to in one operation. We set
230 *times = cputimer->cputime; 248 * running after update_gt_cputime() as a small optimization,
231 raw_spin_unlock_irqrestore(&cputimer->lock, flags); 249 * but barriers are not required because update_gt_cputime()
250 * can handle concurrent updates.
251 */
252 WRITE_ONCE(cputimer->running, 1);
253 }
254 sample_cputime_atomic(times, &cputimer->cputime_atomic);
232} 255}
233 256
234/* 257/*
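
__update_gt_cputime() above implements a lock-free "advance to the larger value" update with a cmpxchg retry loop. The same shape in portable C11, using compare-and-exchange on a 64-bit atomic (the demo_ name is an assumption; the kernel uses atomic64_cmpxchg() on atomic64_t):

	#include <inttypes.h>
	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Raise *cputime to sum_cputime if sum_cputime is larger; never lower it. */
	static void demo_update_gt_cputime(_Atomic uint64_t *cputime, uint64_t sum_cputime)
	{
		uint64_t curr = atomic_load(cputime);

		while (sum_cputime > curr) {
			/*
			 * On failure, curr is reloaded with the value another writer
			 * stored, so the loop either wins the race or observes that
			 * an equal or larger value is already in place.
			 */
			if (atomic_compare_exchange_weak(cputime, &curr, sum_cputime))
				break;
		}
	}

	int main(void)
	{
		_Atomic uint64_t utime = 100;

		demo_update_gt_cputime(&utime, 250);	/* raises the value */
		demo_update_gt_cputime(&utime, 50);	/* leaves it alone */

		printf("utime = %" PRIu64 "\n", atomic_load(&utime));
		return 0;
	}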
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
582 if (!task_cputime_zero(&tsk->cputime_expires)) 605 if (!task_cputime_zero(&tsk->cputime_expires))
583 return false; 606 return false;
584 607
585 if (tsk->signal->cputimer.running) 608 /* Check if cputimer is running. This is accessed without locking. */
609 if (READ_ONCE(tsk->signal->cputimer.running))
586 return false; 610 return false;
587 611
588 return true; 612 return true;
@@ -852,10 +876,10 @@ static void check_thread_timers(struct task_struct *tsk,
852 /* 876 /*
853 * Check for the special case thread timers. 877 * Check for the special case thread timers.
854 */ 878 */
855 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); 879 soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
856 if (soft != RLIM_INFINITY) { 880 if (soft != RLIM_INFINITY) {
857 unsigned long hard = 881 unsigned long hard =
858 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); 882 READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
859 883
860 if (hard != RLIM_INFINITY && 884 if (hard != RLIM_INFINITY &&
861 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 885 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
882 } 906 }
883} 907}
884 908
885static void stop_process_timers(struct signal_struct *sig) 909static inline void stop_process_timers(struct signal_struct *sig)
886{ 910{
887 struct thread_group_cputimer *cputimer = &sig->cputimer; 911 struct thread_group_cputimer *cputimer = &sig->cputimer;
888 unsigned long flags;
889 912
890 raw_spin_lock_irqsave(&cputimer->lock, flags); 913 /* Turn off cputimer->running. This is done without locking. */
891 cputimer->running = 0; 914 WRITE_ONCE(cputimer->running, 0);
892 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
893} 915}
894 916
895static u32 onecputick; 917static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
958 SIGPROF); 980 SIGPROF);
959 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 981 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
960 SIGVTALRM); 982 SIGVTALRM);
961 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); 983 soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
962 if (soft != RLIM_INFINITY) { 984 if (soft != RLIM_INFINITY) {
963 unsigned long psecs = cputime_to_secs(ptime); 985 unsigned long psecs = cputime_to_secs(ptime);
964 unsigned long hard = 986 unsigned long hard =
965 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); 987 READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
966 cputime_t x; 988 cputime_t x;
967 if (psecs >= hard) { 989 if (psecs >= hard) {
968 /* 990 /*
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1111 } 1133 }
1112 1134
1113 sig = tsk->signal; 1135 sig = tsk->signal;
1114 if (sig->cputimer.running) { 1136 /* Check if cputimer is running. This is accessed without locking. */
1137 if (READ_ONCE(sig->cputimer.running)) {
1115 struct task_cputime group_sample; 1138 struct task_cputime group_sample;
1116 1139
1117 raw_spin_lock(&sig->cputimer.lock); 1140 sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
1118 group_sample = sig->cputimer.cputime;
1119 raw_spin_unlock(&sig->cputimer.lock);
1120 1141
1121 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1142 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1122 return 1; 1143 return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1157 * If there are any active process wide timers (POSIX 1.b, itimers, 1178 * If there are any active process wide timers (POSIX 1.b, itimers,
1158 * RLIMIT_CPU) cputimer must be running. 1179 * RLIMIT_CPU) cputimer must be running.
1159 */ 1180 */
1160 if (tsk->signal->cputimer.running) 1181 if (READ_ONCE(tsk->signal->cputimer.running))
1161 check_process_timers(tsk, &firing); 1182 check_process_timers(tsk, &firing);
1162 1183
1163 /* 1184 /*
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
index 4f134d8907a7..f610b2a10b3e 100644
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -191,7 +191,7 @@ int cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
191 /* Update distances based on topology */ 191 /* Update distances based on topology */
192 for_each_cpu(cpu, update_mask) { 192 for_each_cpu(cpu, update_mask) {
193 if (cpu_rmap_copy_neigh(rmap, cpu, 193 if (cpu_rmap_copy_neigh(rmap, cpu,
194 topology_thread_cpumask(cpu), 1)) 194 topology_sibling_cpumask(cpu), 1))
195 continue; 195 continue;
196 if (cpu_rmap_copy_neigh(rmap, cpu, 196 if (cpu_rmap_copy_neigh(rmap, cpu,
197 topology_core_cpumask(cpu), 2)) 197 topology_core_cpumask(cpu), 2))
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 3d2aa27b845b..061550de77bc 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -33,7 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/bitops.h> 34#include <linux/bitops.h>
35#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
36#include <linux/preempt_mask.h> /* in_interrupt() */ 36#include <linux/preempt.h> /* in_interrupt() */
37 37
38 38
39/* 39/*
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index a28df5206d95..36c15a2889e4 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -84,7 +84,8 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
84 * @str: The string to measure. 84 * @str: The string to measure.
85 * @count: Maximum count (including NUL character) 85 * @count: Maximum count (including NUL character)
86 * 86 *
87 * Context: User context only. This function may sleep. 87 * Context: User context only. This function may sleep if pagefaults are
88 * enabled.
88 * 89 *
89 * Get the size of a NUL-terminated string in user space. 90 * Get the size of a NUL-terminated string in user space.
90 * 91 *
@@ -113,7 +114,8 @@ EXPORT_SYMBOL(strnlen_user);
113 * strlen_user: - Get the size of a user string INCLUDING final NUL. 114 * strlen_user: - Get the size of a user string INCLUDING final NUL.
114 * @str: The string to measure. 115 * @str: The string to measure.
115 * 116 *
116 * Context: User context only. This function may sleep. 117 * Context: User context only. This function may sleep if pagefaults are
118 * enabled.
117 * 119 *
118 * Get the size of a NUL-terminated string in user space. 120 * Get the size of a NUL-terminated string in user space.
119 * 121 *
diff --git a/mm/memory.c b/mm/memory.c
index 22e037e3364e..17734c3c1183 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3737,7 +3737,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
3737} 3737}
3738 3738
3739#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) 3739#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
3740void might_fault(void) 3740void __might_fault(const char *file, int line)
3741{ 3741{
3742 /* 3742 /*
3743 * Some code (nfs/sunrpc) uses socket ops on kernel memory while 3743 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
@@ -3747,21 +3747,15 @@ void might_fault(void)
3747 */ 3747 */
3748 if (segment_eq(get_fs(), KERNEL_DS)) 3748 if (segment_eq(get_fs(), KERNEL_DS))
3749 return; 3749 return;
3750 3750 if (pagefault_disabled())
3751 /*
3752 * it would be nicer only to annotate paths which are not under
3753 * pagefault_disable, however that requires a larger audit and
3754 * providing helpers like get_user_atomic.
3755 */
3756 if (in_atomic())
3757 return; 3751 return;
3758 3752 __might_sleep(file, line, 0);
3759 __might_sleep(__FILE__, __LINE__, 0); 3753#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
3760
3761 if (current->mm) 3754 if (current->mm)
3762 might_lock_read(&current->mm->mmap_sem); 3755 might_lock_read(&current->mm->mmap_sem);
3756#endif
3763} 3757}
3764EXPORT_SYMBOL(might_fault); 3758EXPORT_SYMBOL(__might_fault);
3765#endif 3759#endif
3766 3760
3767#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 3761#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
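
__might_fault() now asks pagefault_disabled() instead of in_atomic(), because this series gives each task an explicit pagefault-disable depth that pagefault_disable()/pagefault_enable() maintain. A minimal single-threaded sketch of that counter pattern (the names mirror the kernel helpers, but this is only an illustration, not the kernel code):

	#include <assert.h>
	#include <stdio.h>

	/* Per-task pagefault-disable depth; the kernel keeps this in task_struct. */
	static int pagefault_disabled_count;

	static void pagefault_disable(void)  { pagefault_disabled_count++; }
	static void pagefault_enable(void)   { pagefault_disabled_count--; }
	static int  pagefault_disabled(void) { return pagefault_disabled_count != 0; }

	/* Simplified __might_fault(): only complain when faulting could sleep. */
	static void might_fault(void)
	{
		if (pagefault_disabled())
			return;		/* faults won't sleep here, nothing to check */
		printf("user access here may fault and sleep\n");
	}

	int main(void)
	{
		might_fault();			/* faulting (and sleeping) allowed */

		pagefault_disable();
		might_fault();			/* silent: pagefaults are disabled */
		pagefault_enable();

		assert(!pagefault_disabled());
		return 0;
	}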