-rw-r--r--Documentation/sysctl/kernel.txt76
-rw-r--r--MAINTAINERS2
-rw-r--r--arch/alpha/include/asm/Kbuild1
-rw-r--r--arch/arc/include/asm/Kbuild1
-rw-r--r--arch/arm/include/asm/Kbuild1
-rw-r--r--arch/arm64/include/asm/Kbuild1
-rw-r--r--arch/avr32/include/asm/Kbuild1
-rw-r--r--arch/blackfin/include/asm/Kbuild1
-rw-r--r--arch/c6x/include/asm/Kbuild1
-rw-r--r--arch/cris/include/asm/Kbuild1
-rw-r--r--arch/frv/include/asm/Kbuild1
-rw-r--r--arch/h8300/include/asm/Kbuild1
-rw-r--r--arch/hexagon/include/asm/Kbuild1
-rw-r--r--arch/ia64/include/asm/Kbuild1
-rw-r--r--arch/m32r/include/asm/Kbuild1
-rw-r--r--arch/m68k/include/asm/Kbuild1
-rw-r--r--arch/metag/include/asm/Kbuild1
-rw-r--r--arch/metag/include/asm/topology.h2
-rw-r--r--arch/microblaze/include/asm/Kbuild1
-rw-r--r--arch/mips/include/asm/Kbuild1
-rw-r--r--arch/mips/kernel/rtlx.c19
-rw-r--r--arch/mips/mm/init.c5
-rw-r--r--arch/mn10300/include/asm/Kbuild1
-rw-r--r--arch/openrisc/include/asm/Kbuild1
-rw-r--r--arch/parisc/include/asm/Kbuild1
-rw-r--r--arch/powerpc/include/asm/Kbuild1
-rw-r--r--arch/s390/include/asm/Kbuild1
-rw-r--r--arch/score/include/asm/Kbuild1
-rw-r--r--arch/sh/include/asm/Kbuild1
-rw-r--r--arch/sparc/include/asm/Kbuild1
-rw-r--r--arch/tile/include/asm/Kbuild1
-rw-r--r--arch/um/include/asm/Kbuild1
-rw-r--r--arch/unicore32/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/atomic.h29
-rw-r--r--arch/x86/include/asm/atomic64_64.h28
-rw-r--r--arch/x86/include/asm/bitops.h24
-rw-r--r--arch/x86/include/asm/calling.h50
-rw-r--r--arch/x86/include/asm/local.h28
-rw-r--r--arch/x86/include/asm/preempt.h100
-rw-r--r--arch/x86/include/asm/rmwcc.h41
-rw-r--r--arch/x86/include/asm/thread_info.h5
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/asm-offsets.c1
-rw-r--r--arch/x86/kernel/cpu/common.c5
-rw-r--r--arch/x86/kernel/entry_32.S7
-rw-r--r--arch/x86/kernel/entry_64.S4
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c7
-rw-r--r--arch/x86/kernel/irq_32.c4
-rw-r--r--arch/x86/kernel/preempt.S25
-rw-r--r--arch/x86/kernel/process.c6
-rw-r--r--arch/x86/kernel/process_32.c8
-rw-r--r--arch/x86/kernel/process_64.c8
-rw-r--r--arch/x86/kernel/traps.c4
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c7
-rw-r--r--arch/xtensa/include/asm/Kbuild1
-rw-r--r--drivers/acpi/processor_idle.c46
-rw-r--r--drivers/idle/intel_idle.c2
-rw-r--r--fs/exec.c1
-rw-r--r--fs/proc/array.c2
-rw-r--r--include/asm-generic/preempt.h105
-rw-r--r--include/linux/hardirq.h8
-rw-r--r--include/linux/mempolicy.h1
-rw-r--r--include/linux/migrate.h7
-rw-r--r--include/linux/mm.h118
-rw-r--r--include/linux/mm_types.h17
-rw-r--r--include/linux/page-flags-layout.h28
-rw-r--r--include/linux/preempt.h112
-rw-r--r--include/linux/sched.h167
-rw-r--r--include/linux/sched/sysctl.h1
-rw-r--r--include/linux/stop_machine.h1
-rw-r--r--include/linux/thread_info.h17
-rw-r--r--include/linux/topology.h6
-rw-r--r--include/linux/tty.h28
-rw-r--r--include/linux/uaccess.h8
-rw-r--r--include/linux/wait.h374
-rw-r--r--include/trace/events/sched.h2
-rw-r--r--init/main.c2
-rw-r--r--kernel/bounds.c4
-rw-r--r--kernel/context_tracking.c2
-rw-r--r--kernel/cpu.c17
-rw-r--r--kernel/cpu/idle.c16
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/rcutree.c15
-rw-r--r--kernel/sched/core.c290
-rw-r--r--kernel/sched/debug.c68
-rw-r--r--kernel/sched/fair.c1359
-rw-r--r--kernel/sched/features.h19
-rw-r--r--kernel/sched/idle_task.c2
-rw-r--r--kernel/sched/rt.c22
-rw-r--r--kernel/sched/sched.h52
-rw-r--r--kernel/sched/stats.h46
-rw-r--r--kernel/sched/stop_task.c2
-rw-r--r--kernel/softirq.c16
-rw-r--r--kernel/stop_machine.c288
-rw-r--r--kernel/sysctl.c21
-rw-r--r--kernel/timer.c8
-rw-r--r--kernel/wait.c24
-rw-r--r--lib/locking-selftest.c2
-rw-r--r--lib/smp_processor_id.c3
-rw-r--r--mm/huge_memory.c55
-rw-r--r--mm/memory.c139
-rw-r--r--mm/mempolicy.c82
-rw-r--r--mm/migrate.c30
-rw-r--r--mm/mm_init.c18
-rw-r--r--mm/mmzone.c14
-rw-r--r--mm/mprotect.c65
-rw-r--r--mm/page_alloc.c4
-rw-r--r--net/irda/af_irda.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c7
109 files changed, 3098 insertions, 1181 deletions
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 9d4c1d18ad44..4273b2d71a27 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -355,6 +355,82 @@ utilize.
355 355
356============================================================== 356==============================================================
357 357
358numa_balancing
359
360Enables/disables automatic page-fault-based NUMA memory
361balancing. Memory is moved automatically to nodes
362that access it often.
363
364On NUMA machines, there is a performance penalty if remote memory is
365accessed by a CPU. When this feature is enabled, the kernel samples
366which task or thread is accessing memory
367by periodically unmapping pages and later trapping a page fault. At the
368time of the page fault, it is determined if the data being accessed should
369be migrated to a local memory node.
370
371Unmapping pages and trapping faults incurs additional overhead that is
372ideally offset by improved memory locality, but there is no universal
373guarantee. If the target workload is already bound to NUMA nodes then this
374feature should be disabled. Otherwise, if the system overhead from the
375feature is too high, the rate at which the kernel samples for NUMA hinting
376faults may be controlled by the numa_balancing_scan_period_min_ms,
377numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
378numa_balancing_scan_size_mb, numa_balancing_settle_count and
379numa_balancing_migrate_deferred sysctls.
380
381==============================================================
382
383numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms,
384numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
385
386Automatic NUMA balancing scans a task's address space and unmaps pages to
387detect if pages are properly placed or if the data should be migrated to a
388memory node local to where the task is running. Every "scan delay" interval,
389the task scans the next "scan size" number of pages in its address space.
390When the end of the address space is reached the scanner restarts from the beginning.
391
392In combination, the "scan delay" and "scan size" determine the scan rate.
393When "scan delay" decreases, the scan rate increases. The scan delay and
394hence the scan rate of every task is adaptive and depends on historical
395behaviour. If pages are properly placed then the scan delay increases,
396otherwise the scan delay decreases. The "scan size" is not adaptive but
397the higher the "scan size", the higher the scan rate.
398
399Higher scan rates incur higher system overhead as page faults must be
400trapped and data potentially migrated. However, the higher the scan
401rate, the more quickly a task's memory is migrated to a local node if the
402workload pattern changes, minimising the performance impact of remote
403memory accesses. These sysctls control the thresholds for scan delays and
404the number of pages scanned.
405
406numa_balancing_scan_period_min_ms is the minimum time in milliseconds to
407scan a task's virtual memory. It effectively controls the maximum scanning
408rate for each task.
409
410numa_balancing_scan_delay_ms is the starting "scan delay" used for a task
411when it initially forks.
412
413numa_balancing_scan_period_max_ms is the maximum time in milliseconds to
414scan a task's virtual memory. It effectively controls the minimum scanning
415rate for each task.
416
417numa_balancing_scan_size_mb is how many megabytes' worth of pages are
418scanned in each scan pass.
419
420numa_balancing_settle_count is how many scan periods must complete before
421the scheduler balancer stops pushing the task towards a preferred node. This
422gives the scheduler a chance to place the task on an alternative node if the
423preferred node is overloaded.
424
425numa_balancing_migrate_deferred is how many page migrations get skipped
426unconditionally, after a page migration is skipped because a page is shared
427with other tasks. This reduces page migration overhead, and determines
428how much stronger the scheduler's "move the task near its memory" policy
429becomes, versus the "move memory near its task" memory management policy,
430for workloads with shared memory.
431
432==============================================================
433
358osrelease, ostype & version: 434osrelease, ostype & version:
359 435
360# cat osrelease 436# cat osrelease
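
As a quick illustration of the sysctls documented in the hunk above, the sketch below reads them from /proc/sys/kernel/ and prints the per-task scan-rate ceiling implied by scan_size_mb / scan_period_min_ms. This is plain user-space C, not part of the patch; the paths follow from the file being Documentation/sysctl/kernel.txt and are an assumption of the sketch.

#include <stdio.h>

static long read_sysctl(const char *name)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	long enabled = read_sysctl("numa_balancing");
	long min_ms  = read_sysctl("numa_balancing_scan_period_min_ms");
	long size_mb = read_sysctl("numa_balancing_scan_size_mb");

	printf("numa_balancing: %ld\n", enabled);
	if (min_ms > 0 && size_mb > 0)
		printf("max scan rate: ~%.1f MB/s per task\n",
		       size_mb * 1000.0 / min_ms);
	return 0;
}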
diff --git a/MAINTAINERS b/MAINTAINERS
index 3438384d270c..dcd69cb34806 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7303,6 +7303,8 @@ S: Maintained
7303F: kernel/sched/ 7303F: kernel/sched/
7304F: include/linux/sched.h 7304F: include/linux/sched.h
7305F: include/uapi/linux/sched.h 7305F: include/uapi/linux/sched.h
7306F: kernel/wait.c
7307F: include/linux/wait.h
7306 7308
7307SCORE ARCHITECTURE 7309SCORE ARCHITECTURE
7308M: Chen Liqin <liqin.linux@gmail.com> 7310M: Chen Liqin <liqin.linux@gmail.com>
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index a6e85f448c1c..f01fb505ad52 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
3 3
4generic-y += exec.h 4generic-y += exec.h
5generic-y += trace_clock.h 5generic-y += trace_clock.h
6generic-y += preempt.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index d8dd660898b9..5943f7f9d325 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -46,3 +46,4 @@ generic-y += ucontext.h
46generic-y += user.h 46generic-y += user.h
47generic-y += vga.h 47generic-y += vga.h
48generic-y += xor.h 48generic-y += xor.h
49generic-y += preempt.h
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 59ceae8f3c95..1a7024b41351 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -32,3 +32,4 @@ generic-y += termios.h
32generic-y += timex.h 32generic-y += timex.h
33generic-y += trace_clock.h 33generic-y += trace_clock.h
34generic-y += unaligned.h 34generic-y += unaligned.h
35generic-y += preempt.h
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 79a642d199f2..519f89f5b6a3 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -50,3 +50,4 @@ generic-y += unaligned.h
50generic-y += user.h 50generic-y += user.h
51generic-y += vga.h 51generic-y += vga.h
52generic-y += xor.h 52generic-y += xor.h
53generic-y += preempt.h
diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild
index fd7980743890..658001b52400 100644
--- a/arch/avr32/include/asm/Kbuild
+++ b/arch/avr32/include/asm/Kbuild
@@ -7,6 +7,7 @@ generic-y += div64.h
7generic-y += emergency-restart.h 7generic-y += emergency-restart.h
8generic-y += exec.h 8generic-y += exec.h
9generic-y += futex.h 9generic-y += futex.h
10generic-y += preempt.h
10generic-y += irq_regs.h 11generic-y += irq_regs.h
11generic-y += param.h 12generic-y += param.h
12generic-y += local.h 13generic-y += local.h
diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild
index 127826f8a375..f2b43474b0e2 100644
--- a/arch/blackfin/include/asm/Kbuild
+++ b/arch/blackfin/include/asm/Kbuild
@@ -44,3 +44,4 @@ generic-y += ucontext.h
44generic-y += unaligned.h 44generic-y += unaligned.h
45generic-y += user.h 45generic-y += user.h
46generic-y += xor.h 46generic-y += xor.h
47generic-y += preempt.h
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index e49f918531ad..fc0b3c356027 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -56,3 +56,4 @@ generic-y += ucontext.h
56generic-y += user.h 56generic-y += user.h
57generic-y += vga.h 57generic-y += vga.h
58generic-y += xor.h 58generic-y += xor.h
59generic-y += preempt.h
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index c8325455520e..b06caf649a95 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -11,3 +11,4 @@ generic-y += module.h
11generic-y += trace_clock.h 11generic-y += trace_clock.h
12generic-y += vga.h 12generic-y += vga.h
13generic-y += xor.h 13generic-y += xor.h
14generic-y += preempt.h
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild
index c5d767028306..74742dc6a3da 100644
--- a/arch/frv/include/asm/Kbuild
+++ b/arch/frv/include/asm/Kbuild
@@ -2,3 +2,4 @@
2generic-y += clkdev.h 2generic-y += clkdev.h
3generic-y += exec.h 3generic-y += exec.h
4generic-y += trace_clock.h 4generic-y += trace_clock.h
5generic-y += preempt.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index 8ada3cf0c98d..7e0e7213a481 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -6,3 +6,4 @@ generic-y += mmu.h
6generic-y += module.h 6generic-y += module.h
7generic-y += trace_clock.h 7generic-y += trace_clock.h
8generic-y += xor.h 8generic-y += xor.h
9generic-y += preempt.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 1da17caac23c..67c3450309b7 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -53,3 +53,4 @@ generic-y += types.h
53generic-y += ucontext.h 53generic-y += ucontext.h
54generic-y += unaligned.h 54generic-y += unaligned.h
55generic-y += xor.h 55generic-y += xor.h
56generic-y += preempt.h
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index a3456f34f672..f93ee087e8fe 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -3,4 +3,5 @@ generic-y += clkdev.h
3generic-y += exec.h 3generic-y += exec.h
4generic-y += kvm_para.h 4generic-y += kvm_para.h
5generic-y += trace_clock.h 5generic-y += trace_clock.h
6generic-y += preempt.h
6generic-y += vtime.h \ No newline at end of file 7generic-y += vtime.h \ No newline at end of file
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index bebdc36ebb0a..2b58c5f0bc38 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
3generic-y += exec.h 3generic-y += exec.h
4generic-y += module.h 4generic-y += module.h
5generic-y += trace_clock.h 5generic-y += trace_clock.h
6generic-y += preempt.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 09d77a862da3..a5d27f272a59 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -31,3 +31,4 @@ generic-y += trace_clock.h
31generic-y += types.h 31generic-y += types.h
32generic-y += word-at-a-time.h 32generic-y += word-at-a-time.h
33generic-y += xor.h 33generic-y += xor.h
34generic-y += preempt.h
diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild
index 6ae0ccb632cb..84d0c1d6b9b3 100644
--- a/arch/metag/include/asm/Kbuild
+++ b/arch/metag/include/asm/Kbuild
@@ -52,3 +52,4 @@ generic-y += unaligned.h
52generic-y += user.h 52generic-y += user.h
53generic-y += vga.h 53generic-y += vga.h
54generic-y += xor.h 54generic-y += xor.h
55generic-y += preempt.h
diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h
index 23f5118f58db..8e9c0b3b9691 100644
--- a/arch/metag/include/asm/topology.h
+++ b/arch/metag/include/asm/topology.h
@@ -26,6 +26,8 @@
26 .last_balance = jiffies, \ 26 .last_balance = jiffies, \
27 .balance_interval = 1, \ 27 .balance_interval = 1, \
28 .nr_balance_failed = 0, \ 28 .nr_balance_failed = 0, \
29 .max_newidle_lb_cost = 0, \
30 .next_decay_max_lb_cost = jiffies, \
29} 31}
30 32
31#define cpu_to_node(cpu) ((void)(cpu), 0) 33#define cpu_to_node(cpu) ((void)(cpu), 0)
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index d3c51a6a601d..ce0bbf8f5640 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
3generic-y += exec.h 3generic-y += exec.h
4generic-y += trace_clock.h 4generic-y += trace_clock.h
5generic-y += syscalls.h 5generic-y += syscalls.h
6generic-y += preempt.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 454ddf9bb76f..1acbb8b77a71 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -11,5 +11,6 @@ generic-y += sections.h
11generic-y += segment.h 11generic-y += segment.h
12generic-y += serial.h 12generic-y += serial.h
13generic-y += trace_clock.h 13generic-y += trace_clock.h
14generic-y += preempt.h
14generic-y += ucontext.h 15generic-y += ucontext.h
15generic-y += xor.h 16generic-y += xor.h
diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c
index d763f11e35e2..2c12ea1668d1 100644
--- a/arch/mips/kernel/rtlx.c
+++ b/arch/mips/kernel/rtlx.c
@@ -172,8 +172,9 @@ int rtlx_open(int index, int can_sleep)
172 if (rtlx == NULL) { 172 if (rtlx == NULL) {
173 if( (p = vpe_get_shared(tclimit)) == NULL) { 173 if( (p = vpe_get_shared(tclimit)) == NULL) {
174 if (can_sleep) { 174 if (can_sleep) {
175 __wait_event_interruptible(channel_wqs[index].lx_queue, 175 ret = __wait_event_interruptible(
176 (p = vpe_get_shared(tclimit)), ret); 176 channel_wqs[index].lx_queue,
177 (p = vpe_get_shared(tclimit)));
177 if (ret) 178 if (ret)
178 goto out_fail; 179 goto out_fail;
179 } else { 180 } else {
@@ -263,11 +264,10 @@ unsigned int rtlx_read_poll(int index, int can_sleep)
263 /* data available to read? */ 264 /* data available to read? */
264 if (chan->lx_read == chan->lx_write) { 265 if (chan->lx_read == chan->lx_write) {
265 if (can_sleep) { 266 if (can_sleep) {
266 int ret = 0; 267 int ret = __wait_event_interruptible(
267 268 channel_wqs[index].lx_queue,
268 __wait_event_interruptible(channel_wqs[index].lx_queue,
269 (chan->lx_read != chan->lx_write) || 269 (chan->lx_read != chan->lx_write) ||
270 sp_stopping, ret); 270 sp_stopping);
271 if (ret) 271 if (ret)
272 return ret; 272 return ret;
273 273
@@ -440,14 +440,13 @@ static ssize_t file_write(struct file *file, const char __user * buffer,
440 440
441 /* any space left... */ 441 /* any space left... */
442 if (!rtlx_write_poll(minor)) { 442 if (!rtlx_write_poll(minor)) {
443 int ret = 0; 443 int ret;
444 444
445 if (file->f_flags & O_NONBLOCK) 445 if (file->f_flags & O_NONBLOCK)
446 return -EAGAIN; 446 return -EAGAIN;
447 447
448 __wait_event_interruptible(channel_wqs[minor].rt_queue, 448 ret = __wait_event_interruptible(channel_wqs[minor].rt_queue,
449 rtlx_write_poll(minor), 449 rtlx_write_poll(minor));
450 ret);
451 if (ret) 450 if (ret)
452 return ret; 451 return ret;
453 } 452 }
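
The call sites above follow the new convention: __wait_event_interruptible() now returns its status instead of filling a caller-supplied variable. The shape of a converted caller is sketched below; wq, chan and data_ready() are placeholders for illustration, not symbols from this patch.

	int ret;

	ret = __wait_event_interruptible(wq, data_ready(chan));
	if (ret)
		return ret;	/* -ERESTARTSYS: interrupted by a signal */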
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index e205ef598e97..12156176c7ca 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -124,7 +124,7 @@ void *kmap_coherent(struct page *page, unsigned long addr)
124 124
125 BUG_ON(Page_dcache_dirty(page)); 125 BUG_ON(Page_dcache_dirty(page));
126 126
127 inc_preempt_count(); 127 pagefault_disable();
128 idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1); 128 idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
129#ifdef CONFIG_MIPS_MT_SMTC 129#ifdef CONFIG_MIPS_MT_SMTC
130 idx += FIX_N_COLOURS * smp_processor_id() + 130 idx += FIX_N_COLOURS * smp_processor_id() +
@@ -193,8 +193,7 @@ void kunmap_coherent(void)
193 write_c0_entryhi(old_ctx); 193 write_c0_entryhi(old_ctx);
194 EXIT_CRITICAL(flags); 194 EXIT_CRITICAL(flags);
195#endif 195#endif
196 dec_preempt_count(); 196 pagefault_enable();
197 preempt_check_resched();
198} 197}
199 198
200void copy_user_highpage(struct page *to, struct page *from, 199void copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild
index c5d767028306..74742dc6a3da 100644
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -2,3 +2,4 @@
2generic-y += clkdev.h 2generic-y += clkdev.h
3generic-y += exec.h 3generic-y += exec.h
4generic-y += trace_clock.h 4generic-y += trace_clock.h
5generic-y += preempt.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 195653e851da..78405625e799 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -67,3 +67,4 @@ generic-y += ucontext.h
67generic-y += user.h 67generic-y += user.h
68generic-y += word-at-a-time.h 68generic-y += word-at-a-time.h
69generic-y += xor.h 69generic-y += xor.h
70generic-y += preempt.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index ff4c9faed546..a603b9ebe54c 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -4,3 +4,4 @@ generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \
4 div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \ 4 div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \
5 poll.h xor.h clkdev.h exec.h 5 poll.h xor.h clkdev.h exec.h
6generic-y += trace_clock.h 6generic-y += trace_clock.h
7generic-y += preempt.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 704e6f10ae80..d8f9d2f18a23 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -2,4 +2,5 @@
2generic-y += clkdev.h 2generic-y += clkdev.h
3generic-y += rwsem.h 3generic-y += rwsem.h
4generic-y += trace_clock.h 4generic-y += trace_clock.h
5generic-y += preempt.h
5generic-y += vtime.h \ No newline at end of file 6generic-y += vtime.h \ No newline at end of file
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index f313f9cbcf44..7a5288f3479a 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -2,3 +2,4 @@
2 2
3generic-y += clkdev.h 3generic-y += clkdev.h
4generic-y += trace_clock.h 4generic-y += trace_clock.h
5generic-y += preempt.h
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild
index e1c7bb999b06..f3414ade77a3 100644
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -4,3 +4,4 @@ header-y +=
4generic-y += clkdev.h 4generic-y += clkdev.h
5generic-y += trace_clock.h 5generic-y += trace_clock.h
6generic-y += xor.h 6generic-y += xor.h
7generic-y += preempt.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 280bea9e5e2b..231efbb68108 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -34,3 +34,4 @@ generic-y += termios.h
34generic-y += trace_clock.h 34generic-y += trace_clock.h
35generic-y += ucontext.h 35generic-y += ucontext.h
36generic-y += xor.h 36generic-y += xor.h
37generic-y += preempt.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 7e4a97fbded4..bf390667657a 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -16,3 +16,4 @@ generic-y += serial.h
16generic-y += trace_clock.h 16generic-y += trace_clock.h
17generic-y += types.h 17generic-y += types.h
18generic-y += word-at-a-time.h 18generic-y += word-at-a-time.h
19generic-y += preempt.h
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 664d6ad23f80..22f3bd147fa7 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -38,3 +38,4 @@ generic-y += termios.h
38generic-y += trace_clock.h 38generic-y += trace_clock.h
39generic-y += types.h 39generic-y += types.h
40generic-y += xor.h 40generic-y += xor.h
41generic-y += preempt.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index b30f34a79882..fdde187e6087 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h
3generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h 3generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h
4generic-y += switch_to.h clkdev.h 4generic-y += switch_to.h clkdev.h
5generic-y += trace_clock.h 5generic-y += trace_clock.h
6generic-y += preempt.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 89d8b6c4e39a..00045cbe5c63 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -60,3 +60,4 @@ generic-y += unaligned.h
60generic-y += user.h 60generic-y += user.h
61generic-y += vga.h 61generic-y += vga.h
62generic-y += xor.h 62generic-y += xor.h
63generic-y += preempt.h
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 722aa3b04624..da31c8b8a92d 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -6,6 +6,7 @@
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/alternative.h> 7#include <asm/alternative.h>
8#include <asm/cmpxchg.h> 8#include <asm/cmpxchg.h>
9#include <asm/rmwcc.h>
9 10
10/* 11/*
11 * Atomic operations that C can't guarantee us. Useful for 12 * Atomic operations that C can't guarantee us. Useful for
@@ -76,12 +77,7 @@ static inline void atomic_sub(int i, atomic_t *v)
76 */ 77 */
77static inline int atomic_sub_and_test(int i, atomic_t *v) 78static inline int atomic_sub_and_test(int i, atomic_t *v)
78{ 79{
79 unsigned char c; 80 GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, i, "%0", "e");
80
81 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
82 : "+m" (v->counter), "=qm" (c)
83 : "ir" (i) : "memory");
84 return c;
85} 81}
86 82
87/** 83/**
@@ -118,12 +114,7 @@ static inline void atomic_dec(atomic_t *v)
118 */ 114 */
119static inline int atomic_dec_and_test(atomic_t *v) 115static inline int atomic_dec_and_test(atomic_t *v)
120{ 116{
121 unsigned char c; 117 GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
122
123 asm volatile(LOCK_PREFIX "decl %0; sete %1"
124 : "+m" (v->counter), "=qm" (c)
125 : : "memory");
126 return c != 0;
127} 118}
128 119
129/** 120/**
@@ -136,12 +127,7 @@ static inline int atomic_dec_and_test(atomic_t *v)
136 */ 127 */
137static inline int atomic_inc_and_test(atomic_t *v) 128static inline int atomic_inc_and_test(atomic_t *v)
138{ 129{
139 unsigned char c; 130 GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e");
140
141 asm volatile(LOCK_PREFIX "incl %0; sete %1"
142 : "+m" (v->counter), "=qm" (c)
143 : : "memory");
144 return c != 0;
145} 131}
146 132
147/** 133/**
@@ -155,12 +141,7 @@ static inline int atomic_inc_and_test(atomic_t *v)
155 */ 141 */
156static inline int atomic_add_negative(int i, atomic_t *v) 142static inline int atomic_add_negative(int i, atomic_t *v)
157{ 143{
158 unsigned char c; 144 GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, i, "%0", "s");
159
160 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
161 : "+m" (v->counter), "=qm" (c)
162 : "ir" (i) : "memory");
163 return c;
164} 145}
165 146
166/** 147/**
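
The conversions above only change how the condition code comes out of the asm; the contract of e.g. atomic_dec_and_test() is still "decrement and report whether the result hit zero". A minimal user-space analogue of that contract, using the GCC/Clang __atomic builtins rather than kernel code:

#include <stdbool.h>
#include <stdio.h>

/* Decrement *v atomically, return true when the new value is zero. */
static bool dec_and_test(int *v)
{
	return __atomic_sub_fetch(v, 1, __ATOMIC_SEQ_CST) == 0;
}

int main(void)
{
	int refs = 2;

	printf("%d\n", dec_and_test(&refs));	/* 0: one reference left */
	printf("%d\n", dec_and_test(&refs));	/* 1: dropped to zero */
	return 0;
}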
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 0e1cbfc8ee06..3f065c985aee 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -72,12 +72,7 @@ static inline void atomic64_sub(long i, atomic64_t *v)
72 */ 72 */
73static inline int atomic64_sub_and_test(long i, atomic64_t *v) 73static inline int atomic64_sub_and_test(long i, atomic64_t *v)
74{ 74{
75 unsigned char c; 75 GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, i, "%0", "e");
76
77 asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
78 : "=m" (v->counter), "=qm" (c)
79 : "er" (i), "m" (v->counter) : "memory");
80 return c;
81} 76}
82 77
83/** 78/**
@@ -116,12 +111,7 @@ static inline void atomic64_dec(atomic64_t *v)
116 */ 111 */
117static inline int atomic64_dec_and_test(atomic64_t *v) 112static inline int atomic64_dec_and_test(atomic64_t *v)
118{ 113{
119 unsigned char c; 114 GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e");
120
121 asm volatile(LOCK_PREFIX "decq %0; sete %1"
122 : "=m" (v->counter), "=qm" (c)
123 : "m" (v->counter) : "memory");
124 return c != 0;
125} 115}
126 116
127/** 117/**
@@ -134,12 +124,7 @@ static inline int atomic64_dec_and_test(atomic64_t *v)
134 */ 124 */
135static inline int atomic64_inc_and_test(atomic64_t *v) 125static inline int atomic64_inc_and_test(atomic64_t *v)
136{ 126{
137 unsigned char c; 127 GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e");
138
139 asm volatile(LOCK_PREFIX "incq %0; sete %1"
140 : "=m" (v->counter), "=qm" (c)
141 : "m" (v->counter) : "memory");
142 return c != 0;
143} 128}
144 129
145/** 130/**
@@ -153,12 +138,7 @@ static inline int atomic64_inc_and_test(atomic64_t *v)
153 */ 138 */
154static inline int atomic64_add_negative(long i, atomic64_t *v) 139static inline int atomic64_add_negative(long i, atomic64_t *v)
155{ 140{
156 unsigned char c; 141 GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, i, "%0", "s");
157
158 asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
159 : "=m" (v->counter), "=qm" (c)
160 : "er" (i), "m" (v->counter) : "memory");
161 return c;
162} 142}
163 143
164/** 144/**
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 41639ce8fd63..6d76d0935989 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -14,6 +14,7 @@
14 14
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <asm/alternative.h> 16#include <asm/alternative.h>
17#include <asm/rmwcc.h>
17 18
18#if BITS_PER_LONG == 32 19#if BITS_PER_LONG == 32
19# define _BITOPS_LONG_SHIFT 5 20# define _BITOPS_LONG_SHIFT 5
@@ -204,12 +205,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
204 */ 205 */
205static inline int test_and_set_bit(long nr, volatile unsigned long *addr) 206static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
206{ 207{
207 int oldbit; 208 GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, nr, "%0", "c");
208
209 asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
210 "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
211
212 return oldbit;
213} 209}
214 210
215/** 211/**
@@ -255,13 +251,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
255 */ 251 */
256static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) 252static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
257{ 253{
258 int oldbit; 254 GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, nr, "%0", "c");
259
260 asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
261 "sbb %0,%0"
262 : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
263
264 return oldbit;
265} 255}
266 256
267/** 257/**
@@ -314,13 +304,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
314 */ 304 */
315static inline int test_and_change_bit(long nr, volatile unsigned long *addr) 305static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
316{ 306{
317 int oldbit; 307 GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, nr, "%0", "c");
318
319 asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
320 "sbb %0,%0"
321 : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
322
323 return oldbit;
324} 308}
325 309
326static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) 310static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr)
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 0fa675033912..cb4c73bfeb48 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -48,6 +48,8 @@ For 32-bit we have the following conventions - kernel is built with
48 48
49#include <asm/dwarf2.h> 49#include <asm/dwarf2.h>
50 50
51#ifdef CONFIG_X86_64
52
51/* 53/*
52 * 64-bit system call stack frame layout defines and helpers, 54 * 64-bit system call stack frame layout defines and helpers,
53 * for assembly code: 55 * for assembly code:
@@ -192,3 +194,51 @@ For 32-bit we have the following conventions - kernel is built with
192 .macro icebp 194 .macro icebp
193 .byte 0xf1 195 .byte 0xf1
194 .endm 196 .endm
197
198#else /* CONFIG_X86_64 */
199
200/*
201 * For 32-bit we have only simplified versions of SAVE_ALL/RESTORE_ALL. These
202 * differ from the entry_32.S versions in that they do not change the segment
203 * registers, so they are only suitable for in-kernel use, not when transitioning
204 * from or to user space. The resulting stack frame is not a standard
205 * pt_regs frame. The main use case is calling C code from assembler
206 * when all the registers need to be preserved.
207 */
208
209 .macro SAVE_ALL
210 pushl_cfi %eax
211 CFI_REL_OFFSET eax, 0
212 pushl_cfi %ebp
213 CFI_REL_OFFSET ebp, 0
214 pushl_cfi %edi
215 CFI_REL_OFFSET edi, 0
216 pushl_cfi %esi
217 CFI_REL_OFFSET esi, 0
218 pushl_cfi %edx
219 CFI_REL_OFFSET edx, 0
220 pushl_cfi %ecx
221 CFI_REL_OFFSET ecx, 0
222 pushl_cfi %ebx
223 CFI_REL_OFFSET ebx, 0
224 .endm
225
226 .macro RESTORE_ALL
227 popl_cfi %ebx
228 CFI_RESTORE ebx
229 popl_cfi %ecx
230 CFI_RESTORE ecx
231 popl_cfi %edx
232 CFI_RESTORE edx
233 popl_cfi %esi
234 CFI_RESTORE esi
235 popl_cfi %edi
236 CFI_RESTORE edi
237 popl_cfi %ebp
238 CFI_RESTORE ebp
239 popl_cfi %eax
240 CFI_RESTORE eax
241 .endm
242
243#endif /* CONFIG_X86_64 */
244
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 2d89e3980cbd..5b23e605e707 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -52,12 +52,7 @@ static inline void local_sub(long i, local_t *l)
52 */ 52 */
53static inline int local_sub_and_test(long i, local_t *l) 53static inline int local_sub_and_test(long i, local_t *l)
54{ 54{
55 unsigned char c; 55 GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, i, "%0", "e");
56
57 asm volatile(_ASM_SUB "%2,%0; sete %1"
58 : "+m" (l->a.counter), "=qm" (c)
59 : "ir" (i) : "memory");
60 return c;
61} 56}
62 57
63/** 58/**
@@ -70,12 +65,7 @@ static inline int local_sub_and_test(long i, local_t *l)
70 */ 65 */
71static inline int local_dec_and_test(local_t *l) 66static inline int local_dec_and_test(local_t *l)
72{ 67{
73 unsigned char c; 68 GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e");
74
75 asm volatile(_ASM_DEC "%0; sete %1"
76 : "+m" (l->a.counter), "=qm" (c)
77 : : "memory");
78 return c != 0;
79} 69}
80 70
81/** 71/**
@@ -88,12 +78,7 @@ static inline int local_dec_and_test(local_t *l)
88 */ 78 */
89static inline int local_inc_and_test(local_t *l) 79static inline int local_inc_and_test(local_t *l)
90{ 80{
91 unsigned char c; 81 GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e");
92
93 asm volatile(_ASM_INC "%0; sete %1"
94 : "+m" (l->a.counter), "=qm" (c)
95 : : "memory");
96 return c != 0;
97} 82}
98 83
99/** 84/**
@@ -107,12 +92,7 @@ static inline int local_inc_and_test(local_t *l)
107 */ 92 */
108static inline int local_add_negative(long i, local_t *l) 93static inline int local_add_negative(long i, local_t *l)
109{ 94{
110 unsigned char c; 95 GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, i, "%0", "s");
111
112 asm volatile(_ASM_ADD "%2,%0; sets %1"
113 : "+m" (l->a.counter), "=qm" (c)
114 : "ir" (i) : "memory");
115 return c;
116} 96}
117 97
118/** 98/**
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
new file mode 100644
index 000000000000..8729723636fd
--- /dev/null
+++ b/arch/x86/include/asm/preempt.h
@@ -0,0 +1,100 @@
1#ifndef __ASM_PREEMPT_H
2#define __ASM_PREEMPT_H
3
4#include <asm/rmwcc.h>
5#include <asm/percpu.h>
6#include <linux/thread_info.h>
7
8DECLARE_PER_CPU(int, __preempt_count);
9
10/*
11 * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
12 * that think a non-zero value indicates we cannot preempt.
13 */
14static __always_inline int preempt_count(void)
15{
16 return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
17}
18
19static __always_inline void preempt_count_set(int pc)
20{
21 __this_cpu_write_4(__preempt_count, pc);
22}
23
24/*
25 * must be macros to avoid header recursion hell
26 */
27#define task_preempt_count(p) \
28 (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED)
29
30#define init_task_preempt_count(p) do { \
31 task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
32} while (0)
33
34#define init_idle_preempt_count(p, cpu) do { \
35 task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
36 per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
37} while (0)
38
39/*
40 * We fold the NEED_RESCHED bit into the preempt count such that
41 * preempt_enable() can decrement and test for needing to reschedule with a
42 * single instruction.
43 *
44 * We invert the actual bit, so that when the decrement hits 0 we know we both
45 * need to resched (the bit is cleared) and can resched (no preempt count).
46 */
47
48static __always_inline void set_preempt_need_resched(void)
49{
50 __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
51}
52
53static __always_inline void clear_preempt_need_resched(void)
54{
55 __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
56}
57
58static __always_inline bool test_preempt_need_resched(void)
59{
60 return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
61}
62
63/*
64 * The various preempt_count add/sub methods
65 */
66
67static __always_inline void __preempt_count_add(int val)
68{
69 __this_cpu_add_4(__preempt_count, val);
70}
71
72static __always_inline void __preempt_count_sub(int val)
73{
74 __this_cpu_add_4(__preempt_count, -val);
75}
76
77static __always_inline bool __preempt_count_dec_and_test(void)
78{
79 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
80}
81
82/*
83 * Returns true when we need to resched and can (barring IRQ state).
84 */
85static __always_inline bool should_resched(void)
86{
87 return unlikely(!__this_cpu_read_4(__preempt_count));
88}
89
90#ifdef CONFIG_PREEMPT
91 extern asmlinkage void ___preempt_schedule(void);
92# define __preempt_schedule() asm ("call ___preempt_schedule")
93 extern asmlinkage void preempt_schedule(void);
94# ifdef CONFIG_CONTEXT_TRACKING
95 extern asmlinkage void ___preempt_schedule_context(void);
96# define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
97# endif
98#endif
99
100#endif /* __ASM_PREEMPT_H */
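
To see why the inverted PREEMPT_NEED_RESCHED bit lets preempt_enable() fold "decrement preempt_count" and "test need_resched" into one instruction, here is a toy model of the arithmetic in plain user-space C. It assumes the 0x80000000 bit value; it is a sketch of the idea, not kernel code.

#include <stdio.h>

#define PREEMPT_NEED_RESCHED	0x80000000u

/* The bit is kept *set* while no reschedule is needed, so PREEMPT_ENABLED
 * (count 0, bit set) is a non-zero value and a plain decrement can only
 * reach zero when the count is gone AND a reschedule is pending. */
static unsigned int count = PREEMPT_NEED_RESCHED;	/* PREEMPT_ENABLED */

static void set_need_resched(void)	{ count &= ~PREEMPT_NEED_RESCHED; }
static int dec_and_test(void)		{ return --count == 0; }

int main(void)
{
	count += 1;		/* preempt_disable() */
	set_need_resched();	/* a wakeup marks the pending reschedule */
	if (dec_and_test())	/* preempt_enable(): single dec-and-test */
		puts("resched now");
	return 0;
}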
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
new file mode 100644
index 000000000000..1ff990f1de8e
--- /dev/null
+++ b/arch/x86/include/asm/rmwcc.h
@@ -0,0 +1,41 @@
1#ifndef _ASM_X86_RMWcc
2#define _ASM_X86_RMWcc
3
4#ifdef CC_HAVE_ASM_GOTO
5
6#define __GEN_RMWcc(fullop, var, cc, ...) \
7do { \
8 asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \
9 : : "m" (var), ## __VA_ARGS__ \
10 : "memory" : cc_label); \
11 return 0; \
12cc_label: \
13 return 1; \
14} while (0)
15
16#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
17 __GEN_RMWcc(op " " arg0, var, cc)
18
19#define GEN_BINARY_RMWcc(op, var, val, arg0, cc) \
20 __GEN_RMWcc(op " %1, " arg0, var, cc, "er" (val))
21
22#else /* !CC_HAVE_ASM_GOTO */
23
24#define __GEN_RMWcc(fullop, var, cc, ...) \
25do { \
26 char c; \
27 asm volatile (fullop "; set" cc " %1" \
28 : "+m" (var), "=qm" (c) \
29 : __VA_ARGS__ : "memory"); \
30 return c != 0; \
31} while (0)
32
33#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
34 __GEN_RMWcc(op " " arg0, var, cc)
35
36#define GEN_BINARY_RMWcc(op, var, val, arg0, cc) \
37 __GEN_RMWcc(op " %2, " arg0, var, cc, "er" (val))
38
39#endif /* CC_HAVE_ASM_GOTO */
40
41#endif /* _ASM_X86_RMWcc */
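
For reference, the !CC_HAVE_ASM_GOTO fallback above expands GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e") into roughly the sequence below. This is a user-space x86 sketch with LOCK_PREFIX reduced to a literal lock prefix, mirroring the inline asm the earlier atomic.h code used.

#include <stdio.h>

/* Locked decrement followed by sete to capture ZF -- the "did it hit
 * zero?" flag that the asm-goto variant gets via a conditional jump. */
static int dec_and_test(int *v)
{
	char c;

	asm volatile("lock; decl %0; sete %1"
		     : "+m" (*v), "=qm" (c)
		     : : "memory");
	return c != 0;
}

int main(void)
{
	int counter = 1;

	printf("%d\n", dec_and_test(&counter));	/* prints 1: hit zero */
	return 0;
}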
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 27811190cbd7..c46a46be1ec6 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -28,8 +28,7 @@ struct thread_info {
28 __u32 flags; /* low level flags */ 28 __u32 flags; /* low level flags */
29 __u32 status; /* thread synchronous flags */ 29 __u32 status; /* thread synchronous flags */
30 __u32 cpu; /* current CPU */ 30 __u32 cpu; /* current CPU */
31 int preempt_count; /* 0 => preemptable, 31 int saved_preempt_count;
32 <0 => BUG */
33 mm_segment_t addr_limit; 32 mm_segment_t addr_limit;
34 struct restart_block restart_block; 33 struct restart_block restart_block;
35 void __user *sysenter_return; 34 void __user *sysenter_return;
@@ -49,7 +48,7 @@ struct thread_info {
49 .exec_domain = &default_exec_domain, \ 48 .exec_domain = &default_exec_domain, \
50 .flags = 0, \ 49 .flags = 0, \
51 .cpu = 0, \ 50 .cpu = 0, \
52 .preempt_count = INIT_PREEMPT_COUNT, \ 51 .saved_preempt_count = INIT_PREEMPT_COUNT, \
53 .addr_limit = KERNEL_DS, \ 52 .addr_limit = KERNEL_DS, \
54 .restart_block = { \ 53 .restart_block = { \
55 .fn = do_no_restart_syscall, \ 54 .fn = do_no_restart_syscall, \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a5408b965c9d..9b0a34e2cd79 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -36,6 +36,8 @@ obj-y += tsc.o io_delay.o rtc.o
36obj-y += pci-iommu_table.o 36obj-y += pci-iommu_table.o
37obj-y += resource.o 37obj-y += resource.o
38 38
39obj-$(CONFIG_PREEMPT) += preempt.o
40
39obj-y += process.o 41obj-y += process.o
40obj-y += i387.o xsave.o 42obj-y += i387.o xsave.o
41obj-y += ptrace.o 43obj-y += ptrace.o
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 28610822fb3c..9f6b9341950f 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -32,7 +32,6 @@ void common(void) {
32 OFFSET(TI_flags, thread_info, flags); 32 OFFSET(TI_flags, thread_info, flags);
33 OFFSET(TI_status, thread_info, status); 33 OFFSET(TI_status, thread_info, status);
34 OFFSET(TI_addr_limit, thread_info, addr_limit); 34 OFFSET(TI_addr_limit, thread_info, addr_limit);
35 OFFSET(TI_preempt_count, thread_info, preempt_count);
36 35
37 BLANK(); 36 BLANK();
38 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 37 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2793d1f095a2..5223fe6dec7b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1095,6 +1095,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
1095 1095
1096DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; 1096DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
1097 1097
1098DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
1099EXPORT_PER_CPU_SYMBOL(__preempt_count);
1100
1098DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); 1101DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1099 1102
1100/* 1103/*
@@ -1169,6 +1172,8 @@ void debug_stack_reset(void)
1169 1172
1170DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 1173DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1171EXPORT_PER_CPU_SYMBOL(current_task); 1174EXPORT_PER_CPU_SYMBOL(current_task);
1175DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
1176EXPORT_PER_CPU_SYMBOL(__preempt_count);
1172DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); 1177DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1173 1178
1174#ifdef CONFIG_CC_STACKPROTECTOR 1179#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f0dcb0ceb6a2..fd1bc1b15e6d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -362,12 +362,9 @@ END(ret_from_exception)
362#ifdef CONFIG_PREEMPT 362#ifdef CONFIG_PREEMPT
363ENTRY(resume_kernel) 363ENTRY(resume_kernel)
364 DISABLE_INTERRUPTS(CLBR_ANY) 364 DISABLE_INTERRUPTS(CLBR_ANY)
365 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
366 jnz restore_all
367need_resched: 365need_resched:
368 movl TI_flags(%ebp), %ecx # need_resched set ? 366 cmpl $0,PER_CPU_VAR(__preempt_count)
369 testb $_TIF_NEED_RESCHED, %cl 367 jnz restore_all
370 jz restore_all
371 testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? 368 testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
372 jz restore_all 369 jz restore_all
373 call preempt_schedule_irq 370 call preempt_schedule_irq
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b077f4cc225a..1a2cc64abcd7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1103,10 +1103,8 @@ retint_signal:
1103 /* Returning to kernel space. Check if we need preemption */ 1103 /* Returning to kernel space. Check if we need preemption */
1104 /* rcx: threadinfo. interrupts off. */ 1104 /* rcx: threadinfo. interrupts off. */
1105ENTRY(retint_kernel) 1105ENTRY(retint_kernel)
1106 cmpl $0,TI_preempt_count(%rcx) 1106 cmpl $0,PER_CPU_VAR(__preempt_count)
1107 jnz retint_restore_args 1107 jnz retint_restore_args
1108 bt $TIF_NEED_RESCHED,TI_flags(%rcx)
1109 jnc retint_restore_args
1110 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ 1108 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
1111 jnc retint_restore_args 1109 jnc retint_restore_args
1112 call preempt_schedule_irq 1110 call preempt_schedule_irq
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 0fa69127209a..05fd74f537d6 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -37,3 +37,10 @@ EXPORT_SYMBOL(strstr);
37 37
38EXPORT_SYMBOL(csum_partial); 38EXPORT_SYMBOL(csum_partial);
39EXPORT_SYMBOL(empty_zero_page); 39EXPORT_SYMBOL(empty_zero_page);
40
41#ifdef CONFIG_PREEMPT
42EXPORT_SYMBOL(___preempt_schedule);
43#ifdef CONFIG_CONTEXT_TRACKING
44EXPORT_SYMBOL(___preempt_schedule_context);
45#endif
46#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 4186755f1d7c..3fe066359ac0 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -100,9 +100,6 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
100 irqctx->tinfo.task = curctx->tinfo.task; 100 irqctx->tinfo.task = curctx->tinfo.task;
101 irqctx->tinfo.previous_esp = current_stack_pointer; 101 irqctx->tinfo.previous_esp = current_stack_pointer;
102 102
103 /* Copy the preempt_count so that the [soft]irq checks work. */
104 irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
105
106 if (unlikely(overflow)) 103 if (unlikely(overflow))
107 call_on_stack(print_stack_overflow, isp); 104 call_on_stack(print_stack_overflow, isp);
108 105
@@ -131,7 +128,6 @@ void irq_ctx_init(int cpu)
131 THREAD_SIZE_ORDER)); 128 THREAD_SIZE_ORDER));
132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); 129 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
133 irqctx->tinfo.cpu = cpu; 130 irqctx->tinfo.cpu = cpu;
134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
135 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 131 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
136 132
137 per_cpu(hardirq_ctx, cpu) = irqctx; 133 per_cpu(hardirq_ctx, cpu) = irqctx;
diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
new file mode 100644
index 000000000000..ca7f0d58a87d
--- /dev/null
+++ b/arch/x86/kernel/preempt.S
@@ -0,0 +1,25 @@
1
2#include <linux/linkage.h>
3#include <asm/dwarf2.h>
4#include <asm/asm.h>
5#include <asm/calling.h>
6
7ENTRY(___preempt_schedule)
8 CFI_STARTPROC
9 SAVE_ALL
10 call preempt_schedule
11 RESTORE_ALL
12 ret
13 CFI_ENDPROC
14
15#ifdef CONFIG_CONTEXT_TRACKING
16
17ENTRY(___preempt_schedule_context)
18 CFI_STARTPROC
19 SAVE_ALL
20 call preempt_schedule_context
21 RESTORE_ALL
22 ret
23 CFI_ENDPROC
24
25#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c83516be1052..3fb8d95ab8b5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -391,9 +391,9 @@ static void amd_e400_idle(void)
391 * The switch back from broadcast mode needs to be 391 * The switch back from broadcast mode needs to be
392 * called with interrupts disabled. 392 * called with interrupts disabled.
393 */ 393 */
394 local_irq_disable(); 394 local_irq_disable();
395 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); 395 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
396 local_irq_enable(); 396 local_irq_enable();
397 } else 397 } else
398 default_idle(); 398 default_idle();
399} 399}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 884f98f69354..c2ec1aa6d454 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -292,6 +292,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
292 set_iopl_mask(next->iopl); 292 set_iopl_mask(next->iopl);
293 293
294 /* 294 /*
295 * If it were not for PREEMPT_ACTIVE we could guarantee that the
296 * preempt_count of all tasks was equal here and this would not be
297 * needed.
298 */
299 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
300 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
301
302 /*
295 * Now maybe handle debug registers and/or IO bitmaps 303 * Now maybe handle debug registers and/or IO bitmaps
296 */ 304 */
297 if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || 305 if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bb1dc51bab05..45ab4d6fc8a7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
363 this_cpu_write(old_rsp, next->usersp); 363 this_cpu_write(old_rsp, next->usersp);
364 this_cpu_write(current_task, next_p); 364 this_cpu_write(current_task, next_p);
365 365
366 /*
367 * If it were not for PREEMPT_ACTIVE we could guarantee that the
368 * preempt_count of all tasks was equal here and this would not be
369 * needed.
370 */
371 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
372 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
373
366 this_cpu_write(kernel_stack, 374 this_cpu_write(kernel_stack,
367 (unsigned long)task_stack_page(next_p) + 375 (unsigned long)task_stack_page(next_p) +
368 THREAD_SIZE - KERNEL_STACK_OFFSET); 376 THREAD_SIZE - KERNEL_STACK_OFFSET);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8c8093b146ca..729aa779ff75 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -88,7 +88,7 @@ static inline void conditional_sti(struct pt_regs *regs)
88 88
89static inline void preempt_conditional_sti(struct pt_regs *regs) 89static inline void preempt_conditional_sti(struct pt_regs *regs)
90{ 90{
91 inc_preempt_count(); 91 preempt_count_inc();
92 if (regs->flags & X86_EFLAGS_IF) 92 if (regs->flags & X86_EFLAGS_IF)
93 local_irq_enable(); 93 local_irq_enable();
94} 94}
@@ -103,7 +103,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
103{ 103{
104 if (regs->flags & X86_EFLAGS_IF) 104 if (regs->flags & X86_EFLAGS_IF)
105 local_irq_disable(); 105 local_irq_disable();
106 dec_preempt_count(); 106 preempt_count_dec();
107} 107}
108 108
109static int __kprobes 109static int __kprobes
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index b014d9414d08..040681928e9d 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -66,3 +66,10 @@ EXPORT_SYMBOL(empty_zero_page);
66#ifndef CONFIG_PARAVIRT 66#ifndef CONFIG_PARAVIRT
67EXPORT_SYMBOL(native_load_gs_index); 67EXPORT_SYMBOL(native_load_gs_index);
68#endif 68#endif
69
70#ifdef CONFIG_PREEMPT
71EXPORT_SYMBOL(___preempt_schedule);
72#ifdef CONFIG_CONTEXT_TRACKING
73EXPORT_SYMBOL(___preempt_schedule_context);
74#endif
75#endif
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 1b982641ec35..228d6aee3a16 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -28,3 +28,4 @@ generic-y += termios.h
28generic-y += topology.h 28generic-y += topology.h
29generic-y += trace_clock.h 29generic-y += trace_clock.h
30generic-y += xor.h 30generic-y += xor.h
31generic-y += preempt.h
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index f98dd00b51a9..c7414a545a4f 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -119,17 +119,10 @@ static struct dmi_system_id processor_power_dmi_table[] = {
119 */ 119 */
120static void acpi_safe_halt(void) 120static void acpi_safe_halt(void)
121{ 121{
122 current_thread_info()->status &= ~TS_POLLING; 122 if (!tif_need_resched()) {
123 /*
124 * TS_POLLING-cleared state must be visible before we
125 * test NEED_RESCHED:
126 */
127 smp_mb();
128 if (!need_resched()) {
129 safe_halt(); 123 safe_halt();
130 local_irq_disable(); 124 local_irq_disable();
131 } 125 }
132 current_thread_info()->status |= TS_POLLING;
133} 126}
134 127
135#ifdef ARCH_APICTIMER_STOPS_ON_C3 128#ifdef ARCH_APICTIMER_STOPS_ON_C3
@@ -737,6 +730,11 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
737 if (unlikely(!pr)) 730 if (unlikely(!pr))
738 return -EINVAL; 731 return -EINVAL;
739 732
733 if (cx->entry_method == ACPI_CSTATE_FFH) {
734 if (current_set_polling_and_test())
735 return -EINVAL;
736 }
737
740 lapic_timer_state_broadcast(pr, cx, 1); 738 lapic_timer_state_broadcast(pr, cx, 1);
741 acpi_idle_do_entry(cx); 739 acpi_idle_do_entry(cx);
742 740
@@ -790,18 +788,9 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
790 if (unlikely(!pr)) 788 if (unlikely(!pr))
791 return -EINVAL; 789 return -EINVAL;
792 790
793 if (cx->entry_method != ACPI_CSTATE_FFH) { 791 if (cx->entry_method == ACPI_CSTATE_FFH) {
794 current_thread_info()->status &= ~TS_POLLING; 792 if (current_set_polling_and_test())
795 /*
796 * TS_POLLING-cleared state must be visible before we test
797 * NEED_RESCHED:
798 */
799 smp_mb();
800
801 if (unlikely(need_resched())) {
802 current_thread_info()->status |= TS_POLLING;
803 return -EINVAL; 793 return -EINVAL;
804 }
805 } 794 }
806 795
807 /* 796 /*
@@ -819,9 +808,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
819 808
820 sched_clock_idle_wakeup_event(0); 809 sched_clock_idle_wakeup_event(0);
821 810
822 if (cx->entry_method != ACPI_CSTATE_FFH)
823 current_thread_info()->status |= TS_POLLING;
824
825 lapic_timer_state_broadcast(pr, cx, 0); 811 lapic_timer_state_broadcast(pr, cx, 0);
826 return index; 812 return index;
827} 813}
@@ -858,18 +844,9 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
858 } 844 }
859 } 845 }
860 846
861 if (cx->entry_method != ACPI_CSTATE_FFH) { 847 if (cx->entry_method == ACPI_CSTATE_FFH) {
862 current_thread_info()->status &= ~TS_POLLING; 848 if (current_set_polling_and_test())
863 /*
864 * TS_POLLING-cleared state must be visible before we test
865 * NEED_RESCHED:
866 */
867 smp_mb();
868
869 if (unlikely(need_resched())) {
870 current_thread_info()->status |= TS_POLLING;
871 return -EINVAL; 849 return -EINVAL;
872 }
873 } 850 }
874 851
875 acpi_unlazy_tlb(smp_processor_id()); 852 acpi_unlazy_tlb(smp_processor_id());
@@ -915,9 +892,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
915 892
916 sched_clock_idle_wakeup_event(0); 893 sched_clock_idle_wakeup_event(0);
917 894
918 if (cx->entry_method != ACPI_CSTATE_FFH)
919 current_thread_info()->status |= TS_POLLING;
920
921 lapic_timer_state_broadcast(pr, cx, 0); 895 lapic_timer_state_broadcast(pr, cx, 0);
922 return index; 896 return index;
923} 897}
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index fa6964d8681a..f116d664b473 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -359,7 +359,7 @@ static int intel_idle(struct cpuidle_device *dev,
359 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 359 if (!(lapic_timer_reliable_states & (1 << (cstate))))
360 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 360 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
361 361
362 if (!need_resched()) { 362 if (!current_set_polling_and_test()) {
363 363
364 __monitor((void *)&current_thread_info()->flags, 0, 0); 364 __monitor((void *)&current_thread_info()->flags, 0, 0);
365 smp_mb(); 365 smp_mb();
diff --git a/fs/exec.c b/fs/exec.c
index 8875dd10ae7a..2ea437e5acf4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1547,6 +1547,7 @@ static int do_execve_common(const char *filename,
1547 current->fs->in_exec = 0; 1547 current->fs->in_exec = 0;
1548 current->in_execve = 0; 1548 current->in_execve = 0;
1549 acct_update_integrals(current); 1549 acct_update_integrals(current);
1550 task_numa_free(current);
1550 free_bprm(bprm); 1551 free_bprm(bprm);
1551 if (displaced) 1552 if (displaced)
1552 put_files_struct(displaced); 1553 put_files_struct(displaced);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index cbd0f1b324b9..1bd2077187fd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -183,6 +183,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
183 seq_printf(m, 183 seq_printf(m,
184 "State:\t%s\n" 184 "State:\t%s\n"
185 "Tgid:\t%d\n" 185 "Tgid:\t%d\n"
186 "Ngid:\t%d\n"
186 "Pid:\t%d\n" 187 "Pid:\t%d\n"
187 "PPid:\t%d\n" 188 "PPid:\t%d\n"
188 "TracerPid:\t%d\n" 189 "TracerPid:\t%d\n"
@@ -190,6 +191,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
190 "Gid:\t%d\t%d\t%d\t%d\n", 191 "Gid:\t%d\t%d\t%d\t%d\n",
191 get_task_state(p), 192 get_task_state(p),
192 task_tgid_nr_ns(p, ns), 193 task_tgid_nr_ns(p, ns),
194 task_numa_group_id(p),
193 pid_nr_ns(pid, ns), 195 pid_nr_ns(pid, ns),
194 ppid, tpid, 196 ppid, tpid,
195 from_kuid_munged(user_ns, cred->uid), 197 from_kuid_munged(user_ns, cred->uid),
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
new file mode 100644
index 000000000000..ddf2b420ac8f
--- /dev/null
+++ b/include/asm-generic/preempt.h
@@ -0,0 +1,105 @@
1#ifndef __ASM_PREEMPT_H
2#define __ASM_PREEMPT_H
3
4#include <linux/thread_info.h>
5
6/*
7 * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
8 * that think a non-zero value indicates we cannot preempt.
9 */
10static __always_inline int preempt_count(void)
11{
12 return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED;
13}
14
15static __always_inline int *preempt_count_ptr(void)
16{
17 return &current_thread_info()->preempt_count;
18}
19
20/*
21 * We now lose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the
22 * alternative is losing a reschedule. Better schedule too often -- also this
23 * should be a very rare operation.
24 */
25static __always_inline void preempt_count_set(int pc)
26{
27 *preempt_count_ptr() = pc;
28}
29
30/*
31 * must be macros to avoid header recursion hell
32 */
33#define task_preempt_count(p) \
34 (task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED)
35
36#define init_task_preempt_count(p) do { \
37 task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
38} while (0)
39
40#define init_idle_preempt_count(p, cpu) do { \
41 task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
42} while (0)
43
44/*
45 * We fold the NEED_RESCHED bit into the preempt count such that
46 * preempt_enable() can decrement and test for needing to reschedule with a
47 * single instruction.
48 *
49 * We invert the actual bit, so that when the decrement hits 0 we know we both
50 * need to resched (the bit is cleared) and can resched (no preempt count).
51 */
52
53static __always_inline void set_preempt_need_resched(void)
54{
55 *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED;
56}
57
58static __always_inline void clear_preempt_need_resched(void)
59{
60 *preempt_count_ptr() |= PREEMPT_NEED_RESCHED;
61}
62
63static __always_inline bool test_preempt_need_resched(void)
64{
65 return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED);
66}
67
68/*
69 * The various preempt_count add/sub methods
70 */
71
72static __always_inline void __preempt_count_add(int val)
73{
74 *preempt_count_ptr() += val;
75}
76
77static __always_inline void __preempt_count_sub(int val)
78{
79 *preempt_count_ptr() -= val;
80}
81
82static __always_inline bool __preempt_count_dec_and_test(void)
83{
84 return !--*preempt_count_ptr();
85}
86
87/*
88 * Returns true when we need to resched and can (barring IRQ state).
89 */
90static __always_inline bool should_resched(void)
91{
92 return unlikely(!*preempt_count_ptr());
93}
94
95#ifdef CONFIG_PREEMPT
96extern asmlinkage void preempt_schedule(void);
97#define __preempt_schedule() preempt_schedule()
98
99#ifdef CONFIG_CONTEXT_TRACKING
100extern asmlinkage void preempt_schedule_context(void);
101#define __preempt_schedule_context() preempt_schedule_context()
102#endif
103#endif /* CONFIG_PREEMPT */
104
105#endif /* __ASM_PREEMPT_H */
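
The comments in this new header describe the trick the rest of the series builds on: NEED_RESCHED is folded into the preempt count with the bit stored inverted, so a single decrement that reaches zero means both "reschedule requested" and "preemption allowed". Below is a minimal user-space model of those helpers; it reuses the 0x80000000 value that the linux/preempt.h hunk further down assigns to PREEMPT_NEED_RESCHED, while the plain global stands in for current_thread_info()->preempt_count.

/* User-space model of the inverted PREEMPT_NEED_RESCHED folding. */
#include <stdbool.h>
#include <stdio.h>

#define PREEMPT_NEED_RESCHED 0x80000000u

/* stand-in for current_thread_info()->preempt_count */
static unsigned int preempt_count_raw = PREEMPT_NEED_RESCHED;	/* PREEMPT_ENABLED */

static int preempt_count(void)
{
	/* mask the folded bit so callers still see "0 == preemptible" */
	return preempt_count_raw & ~PREEMPT_NEED_RESCHED;
}

static void set_preempt_need_resched(void)   { preempt_count_raw &= ~PREEMPT_NEED_RESCHED; }
static void clear_preempt_need_resched(void) { preempt_count_raw |=  PREEMPT_NEED_RESCHED; }

static bool preempt_count_dec_and_test(void)
{
	/* hits zero only if the count is 0 *and* the (inverted) bit is clear */
	return --preempt_count_raw == 0;
}

int main(void)
{
	preempt_count_raw += 1;			/* preempt_disable()            */
	set_preempt_need_resched();		/* a wakeup marks a reschedule  */

	printf("preempt_count() = %d\n", preempt_count());	/* prints 1 */
	if (preempt_count_dec_and_test())	/* preempt_enable()             */
		printf("decrement hit zero: need resched AND may resched\n");

	clear_preempt_need_resched();		/* what the schedule would do   */
	return 0;
}
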
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 1e041063b226..d9cf963ac832 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -33,7 +33,7 @@ extern void rcu_nmi_exit(void);
33#define __irq_enter() \ 33#define __irq_enter() \
34 do { \ 34 do { \
35 account_irq_enter_time(current); \ 35 account_irq_enter_time(current); \
36 add_preempt_count(HARDIRQ_OFFSET); \ 36 preempt_count_add(HARDIRQ_OFFSET); \
37 trace_hardirq_enter(); \ 37 trace_hardirq_enter(); \
38 } while (0) 38 } while (0)
39 39
@@ -49,7 +49,7 @@ extern void irq_enter(void);
49 do { \ 49 do { \
50 trace_hardirq_exit(); \ 50 trace_hardirq_exit(); \
51 account_irq_exit_time(current); \ 51 account_irq_exit_time(current); \
52 sub_preempt_count(HARDIRQ_OFFSET); \ 52 preempt_count_sub(HARDIRQ_OFFSET); \
53 } while (0) 53 } while (0)
54 54
55/* 55/*
@@ -62,7 +62,7 @@ extern void irq_exit(void);
62 lockdep_off(); \ 62 lockdep_off(); \
63 ftrace_nmi_enter(); \ 63 ftrace_nmi_enter(); \
64 BUG_ON(in_nmi()); \ 64 BUG_ON(in_nmi()); \
65 add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ 65 preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
66 rcu_nmi_enter(); \ 66 rcu_nmi_enter(); \
67 trace_hardirq_enter(); \ 67 trace_hardirq_enter(); \
68 } while (0) 68 } while (0)
@@ -72,7 +72,7 @@ extern void irq_exit(void);
72 trace_hardirq_exit(); \ 72 trace_hardirq_exit(); \
73 rcu_nmi_exit(); \ 73 rcu_nmi_exit(); \
74 BUG_ON(!in_nmi()); \ 74 BUG_ON(!in_nmi()); \
75 sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ 75 preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
76 ftrace_nmi_exit(); \ 76 ftrace_nmi_exit(); \
77 lockdep_on(); \ 77 lockdep_on(); \
78 } while (0) 78 } while (0)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index da6716b9e3fe..ea4d2495c646 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -136,6 +136,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
136 136
137struct mempolicy *get_vma_policy(struct task_struct *tsk, 137struct mempolicy *get_vma_policy(struct task_struct *tsk,
138 struct vm_area_struct *vma, unsigned long addr); 138 struct vm_area_struct *vma, unsigned long addr);
139bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma);
139 140
140extern void numa_default_policy(void); 141extern void numa_default_policy(void);
141extern void numa_policy_init(void); 142extern void numa_policy_init(void);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 8d3c57fdf221..f5096b58b20d 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -90,11 +90,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
90#endif /* CONFIG_MIGRATION */ 90#endif /* CONFIG_MIGRATION */
91 91
92#ifdef CONFIG_NUMA_BALANCING 92#ifdef CONFIG_NUMA_BALANCING
93extern int migrate_misplaced_page(struct page *page, int node); 93extern int migrate_misplaced_page(struct page *page,
94extern int migrate_misplaced_page(struct page *page, int node); 94 struct vm_area_struct *vma, int node);
95extern bool migrate_ratelimited(int node); 95extern bool migrate_ratelimited(int node);
96#else 96#else
97static inline int migrate_misplaced_page(struct page *page, int node) 97static inline int migrate_misplaced_page(struct page *page,
98 struct vm_area_struct *vma, int node)
98{ 99{
99 return -EAGAIN; /* can't migrate now */ 100 return -EAGAIN; /* can't migrate now */
100} 101}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b6e55ee8855..81443d557a2e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
581 * sets it, so none of the operations on it need to be atomic. 581 * sets it, so none of the operations on it need to be atomic.
582 */ 582 */
583 583
584/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ 584/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
585#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) 585#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
586#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) 586#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
587#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) 587#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
588#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) 588#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
589 589
590/* 590/*
591 * Define the bit shifts to access each section. For non-existent 591 * Define the bit shifts to access each section. For non-existent
@@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
595#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) 595#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
596#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) 596#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
597#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) 597#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
598#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) 598#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
599 599
600/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ 600/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
601#ifdef NODE_NOT_IN_PAGE_FLAGS 601#ifdef NODE_NOT_IN_PAGE_FLAGS
@@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
617#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) 617#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
618#define NODES_MASK ((1UL << NODES_WIDTH) - 1) 618#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
619#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) 619#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
620#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) 620#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_WIDTH) - 1)
621#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) 621#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
622 622
623static inline enum zone_type page_zonenum(const struct page *page) 623static inline enum zone_type page_zonenum(const struct page *page)
@@ -661,51 +661,117 @@ static inline int page_to_nid(const struct page *page)
661#endif 661#endif
662 662
663#ifdef CONFIG_NUMA_BALANCING 663#ifdef CONFIG_NUMA_BALANCING
664#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 664static inline int cpu_pid_to_cpupid(int cpu, int pid)
665static inline int page_nid_xchg_last(struct page *page, int nid)
666{ 665{
667 return xchg(&page->_last_nid, nid); 666 return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
668} 667}
669 668
670static inline int page_nid_last(struct page *page) 669static inline int cpupid_to_pid(int cpupid)
671{ 670{
672 return page->_last_nid; 671 return cpupid & LAST__PID_MASK;
673} 672}
674static inline void page_nid_reset_last(struct page *page) 673
674static inline int cpupid_to_cpu(int cpupid)
675{ 675{
676 page->_last_nid = -1; 676 return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
677} 677}
678#else 678
679static inline int page_nid_last(struct page *page) 679static inline int cpupid_to_nid(int cpupid)
680{ 680{
681 return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; 681 return cpu_to_node(cpupid_to_cpu(cpupid));
682} 682}
683 683
684extern int page_nid_xchg_last(struct page *page, int nid); 684static inline bool cpupid_pid_unset(int cpupid)
685{
686 return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
687}
685 688
686static inline void page_nid_reset_last(struct page *page) 689static inline bool cpupid_cpu_unset(int cpupid)
687{ 690{
688 int nid = (1 << LAST_NID_SHIFT) - 1; 691 return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
692}
689 693
690 page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); 694static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
691 page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; 695{
696 return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
697}
698
699#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
700#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
701static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
702{
703 return xchg(&page->_last_cpupid, cpupid);
704}
705
706static inline int page_cpupid_last(struct page *page)
707{
708 return page->_last_cpupid;
709}
710static inline void page_cpupid_reset_last(struct page *page)
711{
712 page->_last_cpupid = -1;
692} 713}
693#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */
694#else 714#else
695static inline int page_nid_xchg_last(struct page *page, int nid) 715static inline int page_cpupid_last(struct page *page)
696{ 716{
697 return page_to_nid(page); 717 return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
698} 718}
699 719
700static inline int page_nid_last(struct page *page) 720extern int page_cpupid_xchg_last(struct page *page, int cpupid);
721
722static inline void page_cpupid_reset_last(struct page *page)
701{ 723{
702 return page_to_nid(page); 724 int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
725
726 page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
727 page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
728}
729#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
730#else /* !CONFIG_NUMA_BALANCING */
731static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
732{
733 return page_to_nid(page); /* XXX */
703} 734}
704 735
705static inline void page_nid_reset_last(struct page *page) 736static inline int page_cpupid_last(struct page *page)
706{ 737{
738 return page_to_nid(page); /* XXX */
707} 739}
708#endif 740
741static inline int cpupid_to_nid(int cpupid)
742{
743 return -1;
744}
745
746static inline int cpupid_to_pid(int cpupid)
747{
748 return -1;
749}
750
751static inline int cpupid_to_cpu(int cpupid)
752{
753 return -1;
754}
755
756static inline int cpu_pid_to_cpupid(int nid, int pid)
757{
758 return -1;
759}
760
761static inline bool cpupid_pid_unset(int cpupid)
762{
763 return 1;
764}
765
766static inline void page_cpupid_reset_last(struct page *page)
767{
768}
769
770static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
771{
772 return false;
773}
774#endif /* CONFIG_NUMA_BALANCING */
709 775
710static inline struct zone *page_zone(const struct page *page) 776static inline struct zone *page_zone(const struct page *page)
711{ 777{
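
The cpupid helpers added above replace the old last-NID tracking: the last CPU to touch a page and the low bits of the faulting task's pid are packed into one value, so the NUMA hinting fault path can tell whether the same task keeps accessing a page and which node it ran on. The stand-alone check below mirrors that packing; the LAST__PID_SHIFT of 8 comes from the page-flags-layout.h hunk later in this patch, while the 10-bit CPU field is an illustrative assumption (the kernel sizes it from NR_CPUS_BITS).

/* Stand-alone check of the cpupid packing used by the NUMA balancing code. */
#include <assert.h>
#include <stdio.h>

#define LAST__PID_SHIFT 8
#define LAST__PID_MASK  ((1 << LAST__PID_SHIFT) - 1)

#define LAST__CPU_SHIFT 10		/* illustrative; the kernel uses NR_CPUS_BITS */
#define LAST__CPU_MASK  ((1 << LAST__CPU_SHIFT) - 1)

static int cpu_pid_to_cpupid(int cpu, int pid)
{
	return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
}

static int cpupid_to_pid(int cpupid) { return cpupid & LAST__PID_MASK; }
static int cpupid_to_cpu(int cpupid) { return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; }

int main(void)
{
	int cpupid = cpu_pid_to_cpupid(3, 4321);

	/* only the low 8 bits of the pid survive, which is all the
	 * "did the same task touch this page again?" check needs   */
	assert(cpupid_to_cpu(cpupid) == 3);
	assert(cpupid_to_pid(cpupid) == (4321 & LAST__PID_MASK));

	printf("cpupid=0x%x cpu=%d pid-low-bits=%d\n",
	       cpupid, cpupid_to_cpu(cpupid), cpupid_to_pid(cpupid));
	return 0;
}

A value of -1 in either field is the "unset" encoding that cpupid_pid_unset() and cpupid_cpu_unset() above test for.
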
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d9851eeb6e1d..a3198e5aaf4e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -174,8 +174,8 @@ struct page {
174 void *shadow; 174 void *shadow;
175#endif 175#endif
176 176
177#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 177#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
178 int _last_nid; 178 int _last_cpupid;
179#endif 179#endif
180} 180}
181/* 181/*
@@ -420,28 +420,15 @@ struct mm_struct {
420 */ 420 */
421 unsigned long numa_next_scan; 421 unsigned long numa_next_scan;
422 422
423 /* numa_next_reset is when the PTE scanner period will be reset */
424 unsigned long numa_next_reset;
425
426 /* Restart point for scanning and setting pte_numa */ 423 /* Restart point for scanning and setting pte_numa */
427 unsigned long numa_scan_offset; 424 unsigned long numa_scan_offset;
428 425
429 /* numa_scan_seq prevents two threads setting pte_numa */ 426 /* numa_scan_seq prevents two threads setting pte_numa */
430 int numa_scan_seq; 427 int numa_scan_seq;
431
432 /*
433 * The first node a task was scheduled on. If a task runs on
434 * a different node than Make PTE Scan Go Now.
435 */
436 int first_nid;
437#endif 428#endif
438 struct uprobes_state uprobes_state; 429 struct uprobes_state uprobes_state;
439}; 430};
440 431
441/* first nid will either be a valid NID or one of these values */
442#define NUMA_PTE_SCAN_INIT -1
443#define NUMA_PTE_SCAN_ACTIVE -2
444
445static inline void mm_init_cpumask(struct mm_struct *mm) 432static inline void mm_init_cpumask(struct mm_struct *mm)
446{ 433{
447#ifdef CONFIG_CPUMASK_OFFSTACK 434#ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 93506a114034..da523661500a 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -38,10 +38,10 @@
38 * The last is when there is insufficient space in page->flags and a separate 38 * The last is when there is insufficient space in page->flags and a separate
39 * lookup is necessary. 39 * lookup is necessary.
40 * 40 *
41 * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | 41 * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
42 * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS | 42 * " plus space for last_cpupid: | NODE | ZONE | LAST_CPUPID ... | FLAGS |
43 * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | 43 * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
44 * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS | 44 * " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS |
45 * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | 45 * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS |
46 */ 46 */
47#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 47#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
@@ -62,15 +62,21 @@
62#endif 62#endif
63 63
64#ifdef CONFIG_NUMA_BALANCING 64#ifdef CONFIG_NUMA_BALANCING
65#define LAST_NID_SHIFT NODES_SHIFT 65#define LAST__PID_SHIFT 8
66#define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1)
67
68#define LAST__CPU_SHIFT NR_CPUS_BITS
69#define LAST__CPU_MASK ((1 << LAST__CPU_SHIFT)-1)
70
71#define LAST_CPUPID_SHIFT (LAST__PID_SHIFT+LAST__CPU_SHIFT)
66#else 72#else
67#define LAST_NID_SHIFT 0 73#define LAST_CPUPID_SHIFT 0
68#endif 74#endif
69 75
70#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS 76#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
71#define LAST_NID_WIDTH LAST_NID_SHIFT 77#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
72#else 78#else
73#define LAST_NID_WIDTH 0 79#define LAST_CPUPID_WIDTH 0
74#endif 80#endif
75 81
76/* 82/*
@@ -81,8 +87,8 @@
81#define NODE_NOT_IN_PAGE_FLAGS 87#define NODE_NOT_IN_PAGE_FLAGS
82#endif 88#endif
83 89
84#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0 90#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0
85#define LAST_NID_NOT_IN_PAGE_FLAGS 91#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
86#endif 92#endif
87 93
88#endif /* _LINUX_PAGE_FLAGS_LAYOUT */ 94#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
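
page-flags-layout.h only stores LAST_CPUPID inside page->flags when SECTIONS_WIDTH + ZONES_WIDTH + NODES_SHIFT + LAST_CPUPID_SHIFT still fits in BITS_PER_LONG - NR_PAGEFLAGS; otherwise LAST_CPUPID_NOT_IN_PAGE_FLAGS is defined and the value falls back to the _last_cpupid field added to struct page above. The small program that follows redoes that arithmetic for two made-up configurations; none of the widths are taken from a real kernel config, they only show when the fallback kicks in.

/* Redo the page->flags space check from page-flags-layout.h for sample widths. */
#include <stdio.h>

struct layout {
	const char *name;
	int sections_width, zones_width, nodes_shift;
	int pid_bits, cpu_bits;		/* LAST__PID_SHIFT, LAST__CPU_SHIFT */
	int bits_per_long, nr_pageflags;
};

static void check(const struct layout *l)
{
	int last_cpupid_shift = l->pid_bits + l->cpu_bits;
	int used = l->sections_width + l->zones_width + l->nodes_shift + last_cpupid_shift;
	int budget = l->bits_per_long - l->nr_pageflags;

	printf("%s: need %d of %d spare bits -> %s\n", l->name, used, budget,
	       used <= budget ? "LAST_CPUPID in page->flags"
			      : "LAST_CPUPID_NOT_IN_PAGE_FLAGS (use page->_last_cpupid)");
}

int main(void)
{
	/* illustrative numbers only, not a real kernel configuration */
	struct layout big_box   = { "64-bit NUMA box", 0, 2, 6, 8, 9, 64, 22 };
	struct layout small_box = { "32-bit NUMA box", 0, 2, 6, 8, 9, 32, 22 };

	check(&big_box);
	check(&small_box);
	return 0;
}
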
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index f5d4723cdb3d..a3d9dc8c2c00 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -6,106 +6,95 @@
6 * preempt_count (used for kernel preemption, interrupt count, etc.) 6 * preempt_count (used for kernel preemption, interrupt count, etc.)
7 */ 7 */
8 8
9#include <linux/thread_info.h>
10#include <linux/linkage.h> 9#include <linux/linkage.h>
11#include <linux/list.h> 10#include <linux/list.h>
12 11
13#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) 12/*
14 extern void add_preempt_count(int val); 13 * We use the MSB mostly because its available; see <linux/preempt_mask.h> for
15 extern void sub_preempt_count(int val); 14 * the other bits -- can't include that header due to inclusion hell.
16#else 15 */
17# define add_preempt_count(val) do { preempt_count() += (val); } while (0) 16#define PREEMPT_NEED_RESCHED 0x80000000
18# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0)
19#endif
20
21#define inc_preempt_count() add_preempt_count(1)
22#define dec_preempt_count() sub_preempt_count(1)
23
24#define preempt_count() (current_thread_info()->preempt_count)
25
26#ifdef CONFIG_PREEMPT
27
28asmlinkage void preempt_schedule(void);
29
30#define preempt_check_resched() \
31do { \
32 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
33 preempt_schedule(); \
34} while (0)
35
36#ifdef CONFIG_CONTEXT_TRACKING
37 17
38void preempt_schedule_context(void); 18#include <asm/preempt.h>
39 19
40#define preempt_check_resched_context() \ 20#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
41do { \ 21extern void preempt_count_add(int val);
42 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ 22extern void preempt_count_sub(int val);
43 preempt_schedule_context(); \ 23#define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); })
44} while (0)
45#else 24#else
25#define preempt_count_add(val) __preempt_count_add(val)
26#define preempt_count_sub(val) __preempt_count_sub(val)
27#define preempt_count_dec_and_test() __preempt_count_dec_and_test()
28#endif
46 29
47#define preempt_check_resched_context() preempt_check_resched() 30#define __preempt_count_inc() __preempt_count_add(1)
48 31#define __preempt_count_dec() __preempt_count_sub(1)
49#endif /* CONFIG_CONTEXT_TRACKING */
50
51#else /* !CONFIG_PREEMPT */
52
53#define preempt_check_resched() do { } while (0)
54#define preempt_check_resched_context() do { } while (0)
55
56#endif /* CONFIG_PREEMPT */
57 32
33#define preempt_count_inc() preempt_count_add(1)
34#define preempt_count_dec() preempt_count_sub(1)
58 35
59#ifdef CONFIG_PREEMPT_COUNT 36#ifdef CONFIG_PREEMPT_COUNT
60 37
61#define preempt_disable() \ 38#define preempt_disable() \
62do { \ 39do { \
63 inc_preempt_count(); \ 40 preempt_count_inc(); \
64 barrier(); \ 41 barrier(); \
65} while (0) 42} while (0)
66 43
67#define sched_preempt_enable_no_resched() \ 44#define sched_preempt_enable_no_resched() \
68do { \ 45do { \
69 barrier(); \ 46 barrier(); \
70 dec_preempt_count(); \ 47 preempt_count_dec(); \
71} while (0) 48} while (0)
72 49
73#define preempt_enable_no_resched() sched_preempt_enable_no_resched() 50#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
74 51
52#ifdef CONFIG_PREEMPT
75#define preempt_enable() \ 53#define preempt_enable() \
76do { \ 54do { \
77 preempt_enable_no_resched(); \
78 barrier(); \ 55 barrier(); \
79 preempt_check_resched(); \ 56 if (unlikely(preempt_count_dec_and_test())) \
57 __preempt_schedule(); \
58} while (0)
59
60#define preempt_check_resched() \
61do { \
62 if (should_resched()) \
63 __preempt_schedule(); \
80} while (0) 64} while (0)
81 65
82/* For debugging and tracer internals only! */ 66#else
83#define add_preempt_count_notrace(val) \ 67#define preempt_enable() preempt_enable_no_resched()
84 do { preempt_count() += (val); } while (0) 68#define preempt_check_resched() do { } while (0)
85#define sub_preempt_count_notrace(val) \ 69#endif
86 do { preempt_count() -= (val); } while (0)
87#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
88#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
89 70
90#define preempt_disable_notrace() \ 71#define preempt_disable_notrace() \
91do { \ 72do { \
92 inc_preempt_count_notrace(); \ 73 __preempt_count_inc(); \
93 barrier(); \ 74 barrier(); \
94} while (0) 75} while (0)
95 76
96#define preempt_enable_no_resched_notrace() \ 77#define preempt_enable_no_resched_notrace() \
97do { \ 78do { \
98 barrier(); \ 79 barrier(); \
99 dec_preempt_count_notrace(); \ 80 __preempt_count_dec(); \
100} while (0) 81} while (0)
101 82
102/* preempt_check_resched is OK to trace */ 83#ifdef CONFIG_PREEMPT
84
85#ifndef CONFIG_CONTEXT_TRACKING
86#define __preempt_schedule_context() __preempt_schedule()
87#endif
88
103#define preempt_enable_notrace() \ 89#define preempt_enable_notrace() \
104do { \ 90do { \
105 preempt_enable_no_resched_notrace(); \
106 barrier(); \ 91 barrier(); \
107 preempt_check_resched_context(); \ 92 if (unlikely(__preempt_count_dec_and_test())) \
93 __preempt_schedule_context(); \
108} while (0) 94} while (0)
95#else
96#define preempt_enable_notrace() preempt_enable_no_resched_notrace()
97#endif
109 98
110#else /* !CONFIG_PREEMPT_COUNT */ 99#else /* !CONFIG_PREEMPT_COUNT */
111 100
@@ -115,10 +104,11 @@ do { \
115 * that can cause faults and scheduling migrate into our preempt-protected 104 * that can cause faults and scheduling migrate into our preempt-protected
116 * region. 105 * region.
117 */ 106 */
118#define preempt_disable() barrier() 107#define preempt_disable() barrier()
119#define sched_preempt_enable_no_resched() barrier() 108#define sched_preempt_enable_no_resched() barrier()
120#define preempt_enable_no_resched() barrier() 109#define preempt_enable_no_resched() barrier()
121#define preempt_enable() barrier() 110#define preempt_enable() barrier()
111#define preempt_check_resched() do { } while (0)
122 112
123#define preempt_disable_notrace() barrier() 113#define preempt_disable_notrace() barrier()
124#define preempt_enable_no_resched_notrace() barrier() 114#define preempt_enable_no_resched_notrace() barrier()
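
With preempt_enable() now defined as a decrement-and-test, nesting falls out naturally: only the outermost enable can bring the count back to zero and call __preempt_schedule(), while sched_preempt_enable_no_resched() keeps the bare decrement. A user-space model of that behaviour, using the same folded-bit encoding as the sketch after the asm-generic/preempt.h hunk, is below; the counter and the fake __preempt_schedule() are simplified stand-ins.

/* Model of nested preempt_disable()/preempt_enable() with the folded count. */
#include <stdio.h>

#define PREEMPT_NEED_RESCHED 0x80000000u

static unsigned int pc = PREEMPT_NEED_RESCHED;	/* PREEMPT_ENABLED */
static int schedules;

static void __preempt_schedule(void) { schedules++; }

static void preempt_disable(void) { pc++; }

static void preempt_enable(void)
{
	if (--pc == 0) {			/* preempt_count_dec_and_test() */
		__preempt_schedule();
		pc |= PREEMPT_NEED_RESCHED;	/* the schedule re-arms the folded bit */
	}
}

int main(void)
{
	preempt_disable();
	preempt_disable();			/* nested critical section       */
	pc &= ~PREEMPT_NEED_RESCHED;		/* a wakeup sets need_resched    */

	preempt_enable();			/* inner: 2 -> 1, no reschedule  */
	printf("after inner enable: schedules=%d\n", schedules);

	preempt_enable();			/* outer: 1 -> 0, reschedules    */
	printf("after outer enable: schedules=%d\n", schedules);
	return 0;
}
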
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e27baeeda3f4..045b0d227846 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -22,6 +22,7 @@ struct sched_param {
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/nodemask.h> 23#include <linux/nodemask.h>
24#include <linux/mm_types.h> 24#include <linux/mm_types.h>
25#include <linux/preempt.h>
25 26
26#include <asm/page.h> 27#include <asm/page.h>
27#include <asm/ptrace.h> 28#include <asm/ptrace.h>
@@ -427,6 +428,14 @@ struct task_cputime {
427 .sum_exec_runtime = 0, \ 428 .sum_exec_runtime = 0, \
428 } 429 }
429 430
431#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED)
432
433#ifdef CONFIG_PREEMPT_COUNT
434#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
435#else
436#define PREEMPT_DISABLED PREEMPT_ENABLED
437#endif
438
430/* 439/*
431 * Disable preemption until the scheduler is running. 440 * Disable preemption until the scheduler is running.
432 * Reset by start_kernel()->sched_init()->init_idle(). 441 * Reset by start_kernel()->sched_init()->init_idle().
@@ -434,7 +443,7 @@ struct task_cputime {
434 * We include PREEMPT_ACTIVE to avoid cond_resched() from working 443 * We include PREEMPT_ACTIVE to avoid cond_resched() from working
435 * before the scheduler is active -- see should_resched(). 444 * before the scheduler is active -- see should_resched().
436 */ 445 */
437#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) 446#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE)
438 447
439/** 448/**
440 * struct thread_group_cputimer - thread group interval timer counts 449 * struct thread_group_cputimer - thread group interval timer counts
@@ -768,6 +777,7 @@ enum cpu_idle_type {
768#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ 777#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
769#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ 778#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
770#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ 779#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
780#define SD_NUMA 0x4000 /* cross-node balancing */
771 781
772extern int __weak arch_sd_sibiling_asym_packing(void); 782extern int __weak arch_sd_sibiling_asym_packing(void);
773 783
@@ -811,6 +821,10 @@ struct sched_domain {
811 821
812 u64 last_update; 822 u64 last_update;
813 823
824 /* idle_balance() stats */
825 u64 max_newidle_lb_cost;
826 unsigned long next_decay_max_lb_cost;
827
814#ifdef CONFIG_SCHEDSTATS 828#ifdef CONFIG_SCHEDSTATS
815 /* load_balance() stats */ 829 /* load_balance() stats */
816 unsigned int lb_count[CPU_MAX_IDLE_TYPES]; 830 unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -1029,6 +1043,8 @@ struct task_struct {
1029 struct task_struct *last_wakee; 1043 struct task_struct *last_wakee;
1030 unsigned long wakee_flips; 1044 unsigned long wakee_flips;
1031 unsigned long wakee_flip_decay_ts; 1045 unsigned long wakee_flip_decay_ts;
1046
1047 int wake_cpu;
1032#endif 1048#endif
1033 int on_rq; 1049 int on_rq;
1034 1050
@@ -1324,10 +1340,41 @@ struct task_struct {
1324#endif 1340#endif
1325#ifdef CONFIG_NUMA_BALANCING 1341#ifdef CONFIG_NUMA_BALANCING
1326 int numa_scan_seq; 1342 int numa_scan_seq;
1327 int numa_migrate_seq;
1328 unsigned int numa_scan_period; 1343 unsigned int numa_scan_period;
1344 unsigned int numa_scan_period_max;
1345 int numa_preferred_nid;
1346 int numa_migrate_deferred;
1347 unsigned long numa_migrate_retry;
1329 u64 node_stamp; /* migration stamp */ 1348 u64 node_stamp; /* migration stamp */
1330 struct callback_head numa_work; 1349 struct callback_head numa_work;
1350
1351 struct list_head numa_entry;
1352 struct numa_group *numa_group;
1353
1354 /*
1355 * Exponential decaying average of faults on a per-node basis.
 1356 * Scheduling placement decisions are made based on these counts.
 1357 * The values remain static for the duration of a PTE scan.
1358 */
1359 unsigned long *numa_faults;
1360 unsigned long total_numa_faults;
1361
1362 /*
1363 * numa_faults_buffer records faults per node during the current
1364 * scan window. When the scan completes, the counts in numa_faults
1365 * decay and these values are copied.
1366 */
1367 unsigned long *numa_faults_buffer;
1368
1369 /*
1370 * numa_faults_locality tracks if faults recorded during the last
1371 * scan window were remote/local. The task scan period is adapted
1372 * based on the locality of the faults with different weights
1373 * depending on whether they were shared or private faults
1374 */
1375 unsigned long numa_faults_locality[2];
1376
1377 unsigned long numa_pages_migrated;
1331#endif /* CONFIG_NUMA_BALANCING */ 1378#endif /* CONFIG_NUMA_BALANCING */
1332 1379
1333 struct rcu_head rcu; 1380 struct rcu_head rcu;
@@ -1412,16 +1459,33 @@ struct task_struct {
1412/* Future-safe accessor for struct task_struct's cpus_allowed. */ 1459/* Future-safe accessor for struct task_struct's cpus_allowed. */
1413#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) 1460#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
1414 1461
1462#define TNF_MIGRATED 0x01
1463#define TNF_NO_GROUP 0x02
1464#define TNF_SHARED 0x04
1465#define TNF_FAULT_LOCAL 0x08
1466
1415#ifdef CONFIG_NUMA_BALANCING 1467#ifdef CONFIG_NUMA_BALANCING
1416extern void task_numa_fault(int node, int pages, bool migrated); 1468extern void task_numa_fault(int last_node, int node, int pages, int flags);
1469extern pid_t task_numa_group_id(struct task_struct *p);
1417extern void set_numabalancing_state(bool enabled); 1470extern void set_numabalancing_state(bool enabled);
1471extern void task_numa_free(struct task_struct *p);
1472
1473extern unsigned int sysctl_numa_balancing_migrate_deferred;
1418#else 1474#else
1419static inline void task_numa_fault(int node, int pages, bool migrated) 1475static inline void task_numa_fault(int last_node, int node, int pages,
1476 int flags)
1420{ 1477{
1421} 1478}
1479static inline pid_t task_numa_group_id(struct task_struct *p)
1480{
1481 return 0;
1482}
1422static inline void set_numabalancing_state(bool enabled) 1483static inline void set_numabalancing_state(bool enabled)
1423{ 1484{
1424} 1485}
1486static inline void task_numa_free(struct task_struct *p)
1487{
1488}
1425#endif 1489#endif
1426 1490
1427static inline struct pid *task_pid(struct task_struct *task) 1491static inline struct pid *task_pid(struct task_struct *task)
@@ -1974,7 +2038,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
1974#else 2038#else
1975 static inline void kick_process(struct task_struct *tsk) { } 2039 static inline void kick_process(struct task_struct *tsk) { }
1976#endif 2040#endif
1977extern void sched_fork(struct task_struct *p); 2041extern void sched_fork(unsigned long clone_flags, struct task_struct *p);
1978extern void sched_dead(struct task_struct *p); 2042extern void sched_dead(struct task_struct *p);
1979 2043
1980extern void proc_caches_init(void); 2044extern void proc_caches_init(void);
@@ -2401,11 +2465,6 @@ static inline int signal_pending_state(long state, struct task_struct *p)
2401 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); 2465 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
2402} 2466}
2403 2467
2404static inline int need_resched(void)
2405{
2406 return unlikely(test_thread_flag(TIF_NEED_RESCHED));
2407}
2408
2409/* 2468/*
2410 * cond_resched() and cond_resched_lock(): latency reduction via 2469 * cond_resched() and cond_resched_lock(): latency reduction via
2411 * explicit rescheduling in places that are safe. The return 2470 * explicit rescheduling in places that are safe. The return
@@ -2474,36 +2533,105 @@ static inline int tsk_is_polling(struct task_struct *p)
2474{ 2533{
2475 return task_thread_info(p)->status & TS_POLLING; 2534 return task_thread_info(p)->status & TS_POLLING;
2476} 2535}
2477static inline void current_set_polling(void) 2536static inline void __current_set_polling(void)
2478{ 2537{
2479 current_thread_info()->status |= TS_POLLING; 2538 current_thread_info()->status |= TS_POLLING;
2480} 2539}
2481 2540
2482static inline void current_clr_polling(void) 2541static inline bool __must_check current_set_polling_and_test(void)
2542{
2543 __current_set_polling();
2544
2545 /*
2546 * Polling state must be visible before we test NEED_RESCHED,
2547 * paired by resched_task()
2548 */
2549 smp_mb();
2550
2551 return unlikely(tif_need_resched());
2552}
2553
2554static inline void __current_clr_polling(void)
2483{ 2555{
2484 current_thread_info()->status &= ~TS_POLLING; 2556 current_thread_info()->status &= ~TS_POLLING;
2485 smp_mb__after_clear_bit(); 2557}
2558
2559static inline bool __must_check current_clr_polling_and_test(void)
2560{
2561 __current_clr_polling();
2562
2563 /*
2564 * Polling state must be visible before we test NEED_RESCHED,
2565 * paired by resched_task()
2566 */
2567 smp_mb();
2568
2569 return unlikely(tif_need_resched());
2486} 2570}
2487#elif defined(TIF_POLLING_NRFLAG) 2571#elif defined(TIF_POLLING_NRFLAG)
2488static inline int tsk_is_polling(struct task_struct *p) 2572static inline int tsk_is_polling(struct task_struct *p)
2489{ 2573{
2490 return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); 2574 return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
2491} 2575}
2492static inline void current_set_polling(void) 2576
2577static inline void __current_set_polling(void)
2493{ 2578{
2494 set_thread_flag(TIF_POLLING_NRFLAG); 2579 set_thread_flag(TIF_POLLING_NRFLAG);
2495} 2580}
2496 2581
2497static inline void current_clr_polling(void) 2582static inline bool __must_check current_set_polling_and_test(void)
2583{
2584 __current_set_polling();
2585
2586 /*
2587 * Polling state must be visible before we test NEED_RESCHED,
2588 * paired by resched_task()
2589 *
2590 * XXX: assumes set/clear bit are identical barrier wise.
2591 */
2592 smp_mb__after_clear_bit();
2593
2594 return unlikely(tif_need_resched());
2595}
2596
2597static inline void __current_clr_polling(void)
2498{ 2598{
2499 clear_thread_flag(TIF_POLLING_NRFLAG); 2599 clear_thread_flag(TIF_POLLING_NRFLAG);
2500} 2600}
2601
2602static inline bool __must_check current_clr_polling_and_test(void)
2603{
2604 __current_clr_polling();
2605
2606 /*
2607 * Polling state must be visible before we test NEED_RESCHED,
2608 * paired by resched_task()
2609 */
2610 smp_mb__after_clear_bit();
2611
2612 return unlikely(tif_need_resched());
2613}
2614
2501#else 2615#else
2502static inline int tsk_is_polling(struct task_struct *p) { return 0; } 2616static inline int tsk_is_polling(struct task_struct *p) { return 0; }
2503static inline void current_set_polling(void) { } 2617static inline void __current_set_polling(void) { }
2504static inline void current_clr_polling(void) { } 2618static inline void __current_clr_polling(void) { }
2619
2620static inline bool __must_check current_set_polling_and_test(void)
2621{
2622 return unlikely(tif_need_resched());
2623}
2624static inline bool __must_check current_clr_polling_and_test(void)
2625{
2626 return unlikely(tif_need_resched());
2627}
2505#endif 2628#endif
2506 2629
2630static __always_inline bool need_resched(void)
2631{
2632 return unlikely(tif_need_resched());
2633}
2634
2507/* 2635/*
2508 * Thread group CPU time accounting. 2636 * Thread group CPU time accounting.
2509 */ 2637 */
@@ -2545,6 +2673,11 @@ static inline unsigned int task_cpu(const struct task_struct *p)
2545 return task_thread_info(p)->cpu; 2673 return task_thread_info(p)->cpu;
2546} 2674}
2547 2675
2676static inline int task_node(const struct task_struct *p)
2677{
2678 return cpu_to_node(task_cpu(p));
2679}
2680
2548extern void set_task_cpu(struct task_struct *p, unsigned int cpu); 2681extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
2549 2682
2550#else 2683#else
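
The new task_struct fields keep two per-node arrays: numa_faults_buffer collects hinting faults during the current scan window, and numa_faults holds the decaying average that placement decisions read, with numa_preferred_nid caching the chosen node. The sketch below shows one plausible end-of-scan update consistent with those comments; the halving decay factor and the argmax selection are assumptions here, since the actual logic lives in kernel/sched/fair.c and is not part of this excerpt.

/* Sketch of the end-of-scan fault bookkeeping described by the comments above. */
#include <stdio.h>

#define NR_NODES 4

static unsigned long numa_faults[NR_NODES];		/* decayed average     */
static unsigned long numa_faults_buffer[NR_NODES];	/* current scan window */
static int numa_preferred_nid = -1;

static void task_numa_placement(void)
{
	unsigned long best = 0;
	int nid;

	for (nid = 0; nid < NR_NODES; nid++) {
		/* decay the old average and fold in this window's faults;
		 * the /2 factor is an assumption for illustration          */
		numa_faults[nid] = numa_faults[nid] / 2 + numa_faults_buffer[nid];
		numa_faults_buffer[nid] = 0;

		if (numa_faults[nid] > best) {
			best = numa_faults[nid];
			numa_preferred_nid = nid;
		}
	}
}

int main(void)
{
	numa_faults_buffer[2] = 150;	/* most faults came from node 2 */
	numa_faults_buffer[0] = 40;
	task_numa_placement();
	printf("numa_preferred_nid = %d\n", numa_preferred_nid);	/* 2 */
	return 0;
}
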
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index bf8086b2506e..10d16c4fbe89 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -47,7 +47,6 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
47extern unsigned int sysctl_numa_balancing_scan_delay; 47extern unsigned int sysctl_numa_balancing_scan_delay;
48extern unsigned int sysctl_numa_balancing_scan_period_min; 48extern unsigned int sysctl_numa_balancing_scan_period_min;
49extern unsigned int sysctl_numa_balancing_scan_period_max; 49extern unsigned int sysctl_numa_balancing_scan_period_max;
50extern unsigned int sysctl_numa_balancing_scan_period_reset;
51extern unsigned int sysctl_numa_balancing_scan_size; 50extern unsigned int sysctl_numa_balancing_scan_size;
52extern unsigned int sysctl_numa_balancing_settle_count; 51extern unsigned int sysctl_numa_balancing_settle_count;
53 52
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 3b5e910d14ca..d2abbdb8c6aa 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -28,6 +28,7 @@ struct cpu_stop_work {
28}; 28};
29 29
30int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); 30int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
31int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg);
31void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, 32void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
32 struct cpu_stop_work *work_buf); 33 struct cpu_stop_work *work_buf);
33int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); 34int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index e7e04736802f..fddbe2023a5d 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -104,8 +104,21 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
104#define test_thread_flag(flag) \ 104#define test_thread_flag(flag) \
105 test_ti_thread_flag(current_thread_info(), flag) 105 test_ti_thread_flag(current_thread_info(), flag)
106 106
107#define set_need_resched() set_thread_flag(TIF_NEED_RESCHED) 107static inline __deprecated void set_need_resched(void)
108#define clear_need_resched() clear_thread_flag(TIF_NEED_RESCHED) 108{
109 /*
110 * Use of this function in deprecated.
111 *
112 * As of this writing there are only a few users in the DRM tree left
113 * all of which are wrong and can be removed without causing too much
114 * grief.
115 *
116 * The DRM people are aware and are working on removing the last few
117 * instances.
118 */
119}
120
121#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
109 122
110#if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK 123#if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
111/* 124/*
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d3cf0d6e7712..12ae6ce997d6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -106,6 +106,8 @@ int arch_update_cpu_topology(void);
106 .last_balance = jiffies, \ 106 .last_balance = jiffies, \
107 .balance_interval = 1, \ 107 .balance_interval = 1, \
108 .smt_gain = 1178, /* 15% */ \ 108 .smt_gain = 1178, /* 15% */ \
109 .max_newidle_lb_cost = 0, \
110 .next_decay_max_lb_cost = jiffies, \
109} 111}
110#endif 112#endif
111#endif /* CONFIG_SCHED_SMT */ 113#endif /* CONFIG_SCHED_SMT */
@@ -135,6 +137,8 @@ int arch_update_cpu_topology(void);
135 , \ 137 , \
136 .last_balance = jiffies, \ 138 .last_balance = jiffies, \
137 .balance_interval = 1, \ 139 .balance_interval = 1, \
140 .max_newidle_lb_cost = 0, \
141 .next_decay_max_lb_cost = jiffies, \
138} 142}
139#endif 143#endif
140#endif /* CONFIG_SCHED_MC */ 144#endif /* CONFIG_SCHED_MC */
@@ -166,6 +170,8 @@ int arch_update_cpu_topology(void);
166 , \ 170 , \
167 .last_balance = jiffies, \ 171 .last_balance = jiffies, \
168 .balance_interval = 1, \ 172 .balance_interval = 1, \
173 .max_newidle_lb_cost = 0, \
174 .next_decay_max_lb_cost = jiffies, \
169} 175}
170#endif 176#endif
171 177
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 64f864651d86..633cac77f9f9 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -672,31 +672,17 @@ static inline void tty_wait_until_sent_from_close(struct tty_struct *tty,
672#define wait_event_interruptible_tty(tty, wq, condition) \ 672#define wait_event_interruptible_tty(tty, wq, condition) \
673({ \ 673({ \
674 int __ret = 0; \ 674 int __ret = 0; \
675 if (!(condition)) { \ 675 if (!(condition)) \
676 __wait_event_interruptible_tty(tty, wq, condition, __ret); \ 676 __ret = __wait_event_interruptible_tty(tty, wq, \
677 } \ 677 condition); \
678 __ret; \ 678 __ret; \
679}) 679})
680 680
681#define __wait_event_interruptible_tty(tty, wq, condition, ret) \ 681#define __wait_event_interruptible_tty(tty, wq, condition) \
682do { \ 682 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
683 DEFINE_WAIT(__wait); \ 683 tty_unlock(tty); \
684 \
685 for (;;) { \
686 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
687 if (condition) \
688 break; \
689 if (!signal_pending(current)) { \
690 tty_unlock(tty); \
691 schedule(); \ 684 schedule(); \
692 tty_lock(tty); \ 685 tty_lock(tty))
693 continue; \
694 } \
695 ret = -ERESTARTSYS; \
696 break; \
697 } \
698 finish_wait(&wq, &__wait); \
699} while (0)
700 686
701#ifdef CONFIG_PROC_FS 687#ifdef CONFIG_PROC_FS
702extern void proc_tty_register_driver(struct tty_driver *); 688extern void proc_tty_register_driver(struct tty_driver *);
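
__wait_event_interruptible_tty() is now just ___wait_event() with "tty_unlock(tty); schedule(); tty_lock(tty)" supplied as the cmd step, so the tty lock is only dropped around the actual sleep. The toy macro below imitates that shape in user space: the generic loop knows nothing about ttys, the caller threads its lock handling in through cmd, and schedule()/tty_lock()/tty_unlock() are fake stubs that just print.

/* Toy version of ___wait_event()'s "cmd" hook, after the tty conversion above. */
#include <stdio.h>

static int cond;			/* the wakeup condition        */
static int tty_locked = 1;		/* pretend we hold tty_lock()  */

static void tty_unlock(void) { tty_locked = 0; printf("tty unlocked\n"); }
static void tty_lock(void)   { tty_locked = 1; printf("tty relocked\n"); }
static void schedule(void)   { printf("sleeping...\n"); cond = 1; /* fake wakeup */ }

/* generic loop: test the condition, otherwise run the caller-supplied cmd */
#define ___wait_event(condition, cmd)		\
({						\
	int __ret = 0;				\
	for (;;) {				\
		if (condition)			\
			break;			\
		cmd;				\
	}					\
	__ret;					\
})

#define __wait_event_interruptible_tty(condition)	\
	___wait_event(condition,			\
		      tty_unlock();			\
		      schedule();			\
		      tty_lock())

int main(void)
{
	int ret = __wait_event_interruptible_tty(cond);
	printf("done, ret=%d, tty_locked=%d\n", ret, tty_locked);
	return 0;
}
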
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 5ca0951e1855..9d8cf056e661 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -15,7 +15,7 @@
15 */ 15 */
16static inline void pagefault_disable(void) 16static inline void pagefault_disable(void)
17{ 17{
18 inc_preempt_count(); 18 preempt_count_inc();
19 /* 19 /*
20 * make sure to have issued the store before a pagefault 20 * make sure to have issued the store before a pagefault
21 * can hit. 21 * can hit.
@@ -30,11 +30,7 @@ static inline void pagefault_enable(void)
30 * the pagefault handler again. 30 * the pagefault handler again.
31 */ 31 */
32 barrier(); 32 barrier();
33 dec_preempt_count(); 33 preempt_count_dec();
34 /*
35 * make sure we do..
36 */
37 barrier();
38 preempt_check_resched(); 34 preempt_check_resched();
39} 35}
40 36
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a67fc1635592..3b23afa04d6b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -1,7 +1,8 @@
1#ifndef _LINUX_WAIT_H 1#ifndef _LINUX_WAIT_H
2#define _LINUX_WAIT_H 2#define _LINUX_WAIT_H
3 3/*
4 4 * Linux wait queue related types and methods
5 */
5#include <linux/list.h> 6#include <linux/list.h>
6#include <linux/stddef.h> 7#include <linux/stddef.h>
7#include <linux/spinlock.h> 8#include <linux/spinlock.h>
@@ -13,27 +14,27 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v
13int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); 14int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
14 15
15struct __wait_queue { 16struct __wait_queue {
16 unsigned int flags; 17 unsigned int flags;
17#define WQ_FLAG_EXCLUSIVE 0x01 18#define WQ_FLAG_EXCLUSIVE 0x01
18 void *private; 19 void *private;
19 wait_queue_func_t func; 20 wait_queue_func_t func;
20 struct list_head task_list; 21 struct list_head task_list;
21}; 22};
22 23
23struct wait_bit_key { 24struct wait_bit_key {
24 void *flags; 25 void *flags;
25 int bit_nr; 26 int bit_nr;
26#define WAIT_ATOMIC_T_BIT_NR -1 27#define WAIT_ATOMIC_T_BIT_NR -1
27}; 28};
28 29
29struct wait_bit_queue { 30struct wait_bit_queue {
30 struct wait_bit_key key; 31 struct wait_bit_key key;
31 wait_queue_t wait; 32 wait_queue_t wait;
32}; 33};
33 34
34struct __wait_queue_head { 35struct __wait_queue_head {
35 spinlock_t lock; 36 spinlock_t lock;
36 struct list_head task_list; 37 struct list_head task_list;
37}; 38};
38typedef struct __wait_queue_head wait_queue_head_t; 39typedef struct __wait_queue_head wait_queue_head_t;
39 40
@@ -84,17 +85,17 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct
84 85
85static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) 86static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
86{ 87{
87 q->flags = 0; 88 q->flags = 0;
88 q->private = p; 89 q->private = p;
89 q->func = default_wake_function; 90 q->func = default_wake_function;
90} 91}
91 92
92static inline void init_waitqueue_func_entry(wait_queue_t *q, 93static inline void
93 wait_queue_func_t func) 94init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
94{ 95{
95 q->flags = 0; 96 q->flags = 0;
96 q->private = NULL; 97 q->private = NULL;
97 q->func = func; 98 q->func = func;
98} 99}
99 100
100static inline int waitqueue_active(wait_queue_head_t *q) 101static inline int waitqueue_active(wait_queue_head_t *q)
@@ -114,8 +115,8 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
114/* 115/*
115 * Used for wake-one threads: 116 * Used for wake-one threads:
116 */ 117 */
117static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, 118static inline void
118 wait_queue_t *wait) 119__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
119{ 120{
120 wait->flags |= WQ_FLAG_EXCLUSIVE; 121 wait->flags |= WQ_FLAG_EXCLUSIVE;
121 __add_wait_queue(q, wait); 122 __add_wait_queue(q, wait);
@@ -127,23 +128,22 @@ static inline void __add_wait_queue_tail(wait_queue_head_t *head,
127 list_add_tail(&new->task_list, &head->task_list); 128 list_add_tail(&new->task_list, &head->task_list);
128} 129}
129 130
130static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q, 131static inline void
131 wait_queue_t *wait) 132__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
132{ 133{
133 wait->flags |= WQ_FLAG_EXCLUSIVE; 134 wait->flags |= WQ_FLAG_EXCLUSIVE;
134 __add_wait_queue_tail(q, wait); 135 __add_wait_queue_tail(q, wait);
135} 136}
136 137
137static inline void __remove_wait_queue(wait_queue_head_t *head, 138static inline void
138 wait_queue_t *old) 139__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
139{ 140{
140 list_del(&old->task_list); 141 list_del(&old->task_list);
141} 142}
142 143
143void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); 144void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
144void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); 145void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
145void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, 146void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
146 void *key);
147void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); 147void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
148void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); 148void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
149void __wake_up_bit(wait_queue_head_t *, void *, int); 149void __wake_up_bit(wait_queue_head_t *, void *, int);
@@ -170,27 +170,64 @@ wait_queue_head_t *bit_waitqueue(void *, int);
170/* 170/*
171 * Wakeup macros to be used to report events to the targets. 171 * Wakeup macros to be used to report events to the targets.
172 */ 172 */
173#define wake_up_poll(x, m) \ 173#define wake_up_poll(x, m) \
174 __wake_up(x, TASK_NORMAL, 1, (void *) (m)) 174 __wake_up(x, TASK_NORMAL, 1, (void *) (m))
175#define wake_up_locked_poll(x, m) \ 175#define wake_up_locked_poll(x, m) \
176 __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) 176 __wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
177#define wake_up_interruptible_poll(x, m) \ 177#define wake_up_interruptible_poll(x, m) \
178 __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) 178 __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
179#define wake_up_interruptible_sync_poll(x, m) \ 179#define wake_up_interruptible_sync_poll(x, m) \
180 __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) 180 __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
181 181
182#define __wait_event(wq, condition) \ 182#define ___wait_cond_timeout(condition) \
183do { \ 183({ \
184 DEFINE_WAIT(__wait); \ 184 bool __cond = (condition); \
185 if (__cond && !__ret) \
186 __ret = 1; \
187 __cond || !__ret; \
188})
189
190#define ___wait_is_interruptible(state) \
191 (!__builtin_constant_p(state) || \
192 state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \
193
194#define ___wait_event(wq, condition, state, exclusive, ret, cmd) \
195({ \
196 __label__ __out; \
197 wait_queue_t __wait; \
198 long __ret = ret; \
199 \
200 INIT_LIST_HEAD(&__wait.task_list); \
201 if (exclusive) \
202 __wait.flags = WQ_FLAG_EXCLUSIVE; \
203 else \
204 __wait.flags = 0; \
185 \ 205 \
186 for (;;) { \ 206 for (;;) { \
187 prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ 207 long __int = prepare_to_wait_event(&wq, &__wait, state);\
208 \
188 if (condition) \ 209 if (condition) \
189 break; \ 210 break; \
190 schedule(); \ 211 \
212 if (___wait_is_interruptible(state) && __int) { \
213 __ret = __int; \
214 if (exclusive) { \
215 abort_exclusive_wait(&wq, &__wait, \
216 state, NULL); \
217 goto __out; \
218 } \
219 break; \
220 } \
221 \
222 cmd; \
191 } \ 223 } \
192 finish_wait(&wq, &__wait); \ 224 finish_wait(&wq, &__wait); \
193} while (0) 225__out: __ret; \
226})
227
228#define __wait_event(wq, condition) \
229 (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
230 schedule())
194 231
195/** 232/**
196 * wait_event - sleep until a condition gets true 233 * wait_event - sleep until a condition gets true
@@ -204,29 +241,17 @@ do { \
204 * wake_up() has to be called after changing any variable that could 241 * wake_up() has to be called after changing any variable that could
205 * change the result of the wait condition. 242 * change the result of the wait condition.
206 */ 243 */
207#define wait_event(wq, condition) \ 244#define wait_event(wq, condition) \
208do { \ 245do { \
209 if (condition) \ 246 if (condition) \
210 break; \ 247 break; \
211 __wait_event(wq, condition); \ 248 __wait_event(wq, condition); \
212} while (0) 249} while (0)
213 250
214#define __wait_event_timeout(wq, condition, ret) \ 251#define __wait_event_timeout(wq, condition, timeout) \
215do { \ 252 ___wait_event(wq, ___wait_cond_timeout(condition), \
216 DEFINE_WAIT(__wait); \ 253 TASK_UNINTERRUPTIBLE, 0, timeout, \
217 \ 254 __ret = schedule_timeout(__ret))
218 for (;;) { \
219 prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
220 if (condition) \
221 break; \
222 ret = schedule_timeout(ret); \
223 if (!ret) \
224 break; \
225 } \
226 if (!ret && (condition)) \
227 ret = 1; \
228 finish_wait(&wq, &__wait); \
229} while (0)
230 255
231/** 256/**
232 * wait_event_timeout - sleep until a condition gets true or a timeout elapses 257 * wait_event_timeout - sleep until a condition gets true or a timeout elapses
@@ -248,28 +273,14 @@ do { \
248#define wait_event_timeout(wq, condition, timeout) \ 273#define wait_event_timeout(wq, condition, timeout) \
249({ \ 274({ \
250 long __ret = timeout; \ 275 long __ret = timeout; \
251 if (!(condition)) \ 276 if (!___wait_cond_timeout(condition)) \
252 __wait_event_timeout(wq, condition, __ret); \ 277 __ret = __wait_event_timeout(wq, condition, timeout); \
253 __ret; \ 278 __ret; \
254}) 279})
255 280
256#define __wait_event_interruptible(wq, condition, ret) \ 281#define __wait_event_interruptible(wq, condition) \
257do { \ 282 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
258 DEFINE_WAIT(__wait); \ 283 schedule())
259 \
260 for (;;) { \
261 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
262 if (condition) \
263 break; \
264 if (!signal_pending(current)) { \
265 schedule(); \
266 continue; \
267 } \
268 ret = -ERESTARTSYS; \
269 break; \
270 } \
271 finish_wait(&wq, &__wait); \
272} while (0)
273 284
274/** 285/**
275 * wait_event_interruptible - sleep until a condition gets true 286 * wait_event_interruptible - sleep until a condition gets true
@@ -290,31 +301,14 @@ do { \
290({ \ 301({ \
291 int __ret = 0; \ 302 int __ret = 0; \
292 if (!(condition)) \ 303 if (!(condition)) \
293 __wait_event_interruptible(wq, condition, __ret); \ 304 __ret = __wait_event_interruptible(wq, condition); \
294 __ret; \ 305 __ret; \
295}) 306})
296 307
297#define __wait_event_interruptible_timeout(wq, condition, ret) \ 308#define __wait_event_interruptible_timeout(wq, condition, timeout) \
298do { \ 309 ___wait_event(wq, ___wait_cond_timeout(condition), \
299 DEFINE_WAIT(__wait); \ 310 TASK_INTERRUPTIBLE, 0, timeout, \
300 \ 311 __ret = schedule_timeout(__ret))
301 for (;;) { \
302 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
303 if (condition) \
304 break; \
305 if (!signal_pending(current)) { \
306 ret = schedule_timeout(ret); \
307 if (!ret) \
308 break; \
309 continue; \
310 } \
311 ret = -ERESTARTSYS; \
312 break; \
313 } \
314 if (!ret && (condition)) \
315 ret = 1; \
316 finish_wait(&wq, &__wait); \
317} while (0)
318 312
319/** 313/**
320 * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses 314 * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
@@ -337,15 +331,15 @@ do { \
337#define wait_event_interruptible_timeout(wq, condition, timeout) \ 331#define wait_event_interruptible_timeout(wq, condition, timeout) \
338({ \ 332({ \
339 long __ret = timeout; \ 333 long __ret = timeout; \
340 if (!(condition)) \ 334 if (!___wait_cond_timeout(condition)) \
341 __wait_event_interruptible_timeout(wq, condition, __ret); \ 335 __ret = __wait_event_interruptible_timeout(wq, \
336 condition, timeout); \
342 __ret; \ 337 __ret; \
343}) 338})
344 339
345#define __wait_event_hrtimeout(wq, condition, timeout, state) \ 340#define __wait_event_hrtimeout(wq, condition, timeout, state) \
346({ \ 341({ \
347 int __ret = 0; \ 342 int __ret = 0; \
348 DEFINE_WAIT(__wait); \
349 struct hrtimer_sleeper __t; \ 343 struct hrtimer_sleeper __t; \
350 \ 344 \
351 hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ 345 hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \
@@ -356,25 +350,15 @@ do { \
356 current->timer_slack_ns, \ 350 current->timer_slack_ns, \
357 HRTIMER_MODE_REL); \ 351 HRTIMER_MODE_REL); \
358 \ 352 \
359 for (;;) { \ 353 __ret = ___wait_event(wq, condition, state, 0, 0, \
360 prepare_to_wait(&wq, &__wait, state); \
361 if (condition) \
362 break; \
363 if (state == TASK_INTERRUPTIBLE && \
364 signal_pending(current)) { \
365 __ret = -ERESTARTSYS; \
366 break; \
367 } \
368 if (!__t.task) { \ 354 if (!__t.task) { \
369 __ret = -ETIME; \ 355 __ret = -ETIME; \
370 break; \ 356 break; \
371 } \ 357 } \
372 schedule(); \ 358 schedule()); \
373 } \
374 \ 359 \
375 hrtimer_cancel(&__t.timer); \ 360 hrtimer_cancel(&__t.timer); \
376 destroy_hrtimer_on_stack(&__t.timer); \ 361 destroy_hrtimer_on_stack(&__t.timer); \
377 finish_wait(&wq, &__wait); \
378 __ret; \ 362 __ret; \
379}) 363})
380 364
@@ -428,33 +412,15 @@ do { \
428 __ret; \ 412 __ret; \
429}) 413})
430 414
431#define __wait_event_interruptible_exclusive(wq, condition, ret) \ 415#define __wait_event_interruptible_exclusive(wq, condition) \
432do { \ 416 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
433 DEFINE_WAIT(__wait); \ 417 schedule())
434 \
435 for (;;) { \
436 prepare_to_wait_exclusive(&wq, &__wait, \
437 TASK_INTERRUPTIBLE); \
438 if (condition) { \
439 finish_wait(&wq, &__wait); \
440 break; \
441 } \
442 if (!signal_pending(current)) { \
443 schedule(); \
444 continue; \
445 } \
446 ret = -ERESTARTSYS; \
447 abort_exclusive_wait(&wq, &__wait, \
448 TASK_INTERRUPTIBLE, NULL); \
449 break; \
450 } \
451} while (0)
452 418
453#define wait_event_interruptible_exclusive(wq, condition) \ 419#define wait_event_interruptible_exclusive(wq, condition) \
454({ \ 420({ \
455 int __ret = 0; \ 421 int __ret = 0; \
456 if (!(condition)) \ 422 if (!(condition)) \
457 __wait_event_interruptible_exclusive(wq, condition, __ret);\ 423 __ret = __wait_event_interruptible_exclusive(wq, condition);\
458 __ret; \ 424 __ret; \
459}) 425})
460 426
@@ -606,24 +572,8 @@ do { \
606 ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1)) 572 ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1))
607 573
608 574
609 575#define __wait_event_killable(wq, condition) \
610#define __wait_event_killable(wq, condition, ret) \ 576 ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())
611do { \
612 DEFINE_WAIT(__wait); \
613 \
614 for (;;) { \
615 prepare_to_wait(&wq, &__wait, TASK_KILLABLE); \
616 if (condition) \
617 break; \
618 if (!fatal_signal_pending(current)) { \
619 schedule(); \
620 continue; \
621 } \
622 ret = -ERESTARTSYS; \
623 break; \
624 } \
625 finish_wait(&wq, &__wait); \
626} while (0)
627 577
628/** 578/**
629 * wait_event_killable - sleep until a condition gets true 579 * wait_event_killable - sleep until a condition gets true
@@ -644,26 +594,17 @@ do { \
644({ \ 594({ \
645 int __ret = 0; \ 595 int __ret = 0; \
646 if (!(condition)) \ 596 if (!(condition)) \
647 __wait_event_killable(wq, condition, __ret); \ 597 __ret = __wait_event_killable(wq, condition); \
648 __ret; \ 598 __ret; \
649}) 599})
650 600
651 601
652#define __wait_event_lock_irq(wq, condition, lock, cmd) \ 602#define __wait_event_lock_irq(wq, condition, lock, cmd) \
653do { \ 603 (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
654 DEFINE_WAIT(__wait); \ 604 spin_unlock_irq(&lock); \
655 \ 605 cmd; \
656 for (;;) { \ 606 schedule(); \
657 prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ 607 spin_lock_irq(&lock))
658 if (condition) \
659 break; \
660 spin_unlock_irq(&lock); \
661 cmd; \
662 schedule(); \
663 spin_lock_irq(&lock); \
664 } \
665 finish_wait(&wq, &__wait); \
666} while (0)
667 608
668/** 609/**
669 * wait_event_lock_irq_cmd - sleep until a condition gets true. The 610 * wait_event_lock_irq_cmd - sleep until a condition gets true. The
@@ -723,26 +664,12 @@ do { \
723} while (0) 664} while (0)
724 665
725 666
726#define __wait_event_interruptible_lock_irq(wq, condition, \ 667#define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd) \
727 lock, ret, cmd) \ 668 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
728do { \ 669 spin_unlock_irq(&lock); \
729 DEFINE_WAIT(__wait); \ 670 cmd; \
730 \ 671 schedule(); \
731 for (;;) { \ 672 spin_lock_irq(&lock))
732 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
733 if (condition) \
734 break; \
735 if (signal_pending(current)) { \
736 ret = -ERESTARTSYS; \
737 break; \
738 } \
739 spin_unlock_irq(&lock); \
740 cmd; \
741 schedule(); \
742 spin_lock_irq(&lock); \
743 } \
744 finish_wait(&wq, &__wait); \
745} while (0)
746 673
747/** 674/**
748 * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. 675 * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
@@ -772,10 +699,9 @@ do { \
772#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ 699#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \
773({ \ 700({ \
774 int __ret = 0; \ 701 int __ret = 0; \
775 \
776 if (!(condition)) \ 702 if (!(condition)) \
777 __wait_event_interruptible_lock_irq(wq, condition, \ 703 __ret = __wait_event_interruptible_lock_irq(wq, \
778 lock, __ret, cmd); \ 704 condition, lock, cmd); \
779 __ret; \ 705 __ret; \
780}) 706})
781 707
@@ -804,39 +730,24 @@ do { \
804#define wait_event_interruptible_lock_irq(wq, condition, lock) \ 730#define wait_event_interruptible_lock_irq(wq, condition, lock) \
805({ \ 731({ \
806 int __ret = 0; \ 732 int __ret = 0; \
807 \
808 if (!(condition)) \ 733 if (!(condition)) \
809 __wait_event_interruptible_lock_irq(wq, condition, \ 734 __ret = __wait_event_interruptible_lock_irq(wq, \
810 lock, __ret, ); \ 735 condition, lock,); \
811 __ret; \ 736 __ret; \
812}) 737})
813 738
814#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ 739#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \
815 lock, ret) \ 740 lock, timeout) \
816do { \ 741 ___wait_event(wq, ___wait_cond_timeout(condition), \
817 DEFINE_WAIT(__wait); \ 742 TASK_INTERRUPTIBLE, 0, timeout, \
818 \ 743 spin_unlock_irq(&lock); \
819 for (;;) { \ 744 __ret = schedule_timeout(__ret); \
820 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ 745 spin_lock_irq(&lock));
821 if (condition) \
822 break; \
823 if (signal_pending(current)) { \
824 ret = -ERESTARTSYS; \
825 break; \
826 } \
827 spin_unlock_irq(&lock); \
828 ret = schedule_timeout(ret); \
829 spin_lock_irq(&lock); \
830 if (!ret) \
831 break; \
832 } \
833 finish_wait(&wq, &__wait); \
834} while (0)
835 746
836/** 747/**
837 * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets true or a timeout elapses. 748 * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
838 * The condition is checked under the lock. This is expected 749 * true or a timeout elapses. The condition is checked under
839 * to be called with the lock taken. 750 * the lock. This is expected to be called with the lock taken.
840 * @wq: the waitqueue to wait on 751 * @wq: the waitqueue to wait on
841 * @condition: a C expression for the event to wait for 752 * @condition: a C expression for the event to wait for
842 * @lock: a locked spinlock_t, which will be released before schedule() 753 * @lock: a locked spinlock_t, which will be released before schedule()
@@ -860,11 +771,10 @@ do { \
860#define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \ 771#define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \
861 timeout) \ 772 timeout) \
862({ \ 773({ \
863 int __ret = timeout; \ 774 long __ret = timeout; \
864 \ 775 if (!___wait_cond_timeout(condition)) \
865 if (!(condition)) \ 776 __ret = __wait_event_interruptible_lock_irq_timeout( \
866 __wait_event_interruptible_lock_irq_timeout( \ 777 wq, condition, lock, timeout); \
867 wq, condition, lock, __ret); \
868 __ret; \ 778 __ret; \
869}) 779})
870 780
@@ -875,20 +785,18 @@ do { \
875 * We plan to remove these interfaces. 785 * We plan to remove these interfaces.
876 */ 786 */
877extern void sleep_on(wait_queue_head_t *q); 787extern void sleep_on(wait_queue_head_t *q);
878extern long sleep_on_timeout(wait_queue_head_t *q, 788extern long sleep_on_timeout(wait_queue_head_t *q, signed long timeout);
879 signed long timeout);
880extern void interruptible_sleep_on(wait_queue_head_t *q); 789extern void interruptible_sleep_on(wait_queue_head_t *q);
881extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, 790extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout);
882 signed long timeout);
883 791
884/* 792/*
885 * Waitqueues which are removed from the waitqueue_head at wakeup time 793 * Waitqueues which are removed from the waitqueue_head at wakeup time
886 */ 794 */
887void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); 795void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
888void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); 796void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
797long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
889void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); 798void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
890void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, 799void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
891 unsigned int mode, void *key);
892int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 800int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
893int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 801int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
894 802
@@ -934,8 +842,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
934 * One uses wait_on_bit() where one is waiting for the bit to clear, 842 * One uses wait_on_bit() where one is waiting for the bit to clear,
935 * but has no intention of setting it. 843 * but has no intention of setting it.
936 */ 844 */
937static inline int wait_on_bit(void *word, int bit, 845static inline int
938 int (*action)(void *), unsigned mode) 846wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode)
939{ 847{
940 if (!test_bit(bit, word)) 848 if (!test_bit(bit, word))
941 return 0; 849 return 0;
@@ -958,8 +866,8 @@ static inline int wait_on_bit(void *word, int bit,
958 * One uses wait_on_bit_lock() where one is waiting for the bit to 866 * One uses wait_on_bit_lock() where one is waiting for the bit to
959 * clear with the intention of setting it, and when done, clearing it. 867 * clear with the intention of setting it, and when done, clearing it.
960 */ 868 */
961static inline int wait_on_bit_lock(void *word, int bit, 869static inline int
962 int (*action)(void *), unsigned mode) 870wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode)
963{ 871{
964 if (!test_and_set_bit(bit, word)) 872 if (!test_and_set_bit(bit, word))
965 return 0; 873 return 0;
@@ -983,5 +891,5 @@ int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
983 return 0; 891 return 0;
984 return out_of_line_wait_on_atomic_t(val, action, mode); 892 return out_of_line_wait_on_atomic_t(val, action, mode);
985} 893}
986 894
987#endif 895#endif /* _LINUX_WAIT_H */
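
The rework above funnels every wait_event_*() variant through the common ___wait_event() helper, but the caller-visible contract is unchanged. As a reminder of that contract, here is a minimal illustrative sketch (not part of the patch; "foo_dev" and its fields are hypothetical) of how wait_event_interruptible_timeout() is typically consumed:

	#include <linux/wait.h>
	#include <linux/jiffies.h>
	#include <linux/errno.h>
	#include <linux/types.h>

	/* Hypothetical device state; only 'wq' and 'done' matter here. */
	struct foo_dev {
		wait_queue_head_t	wq;
		bool			done;
	};

	static int foo_wait_for_done(struct foo_dev *dev)
	{
		long left;

		/*
		 * Remaining jiffies (>= 1) if 'done' became true, 0 if the
		 * timeout elapsed first, -ERESTARTSYS if a signal interrupted
		 * the sleep -- the same contract as before this rework.
		 */
		left = wait_event_interruptible_timeout(dev->wq, dev->done, HZ);
		if (left == 0)
			return -ETIMEDOUT;
		if (left < 0)
			return left;
		return 0;
	}

Note also that the timeout variants now evaluate the condition through ___wait_cond_timeout(), which is meant to report success rather than a timeout when the condition turns true right at expiry.
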
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 2e7d9947a10d..613381bcde40 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
100 /* 100 /*
101 * For all intents and purposes a preempted task is a running task. 101 * For all intents and purposes a preempted task is a running task.
102 */ 102 */
103 if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) 103 if (task_preempt_count(p) & PREEMPT_ACTIVE)
104 state = TASK_RUNNING | TASK_STATE_MAX; 104 state = TASK_RUNNING | TASK_STATE_MAX;
105#endif 105#endif
106 106
diff --git a/init/main.c b/init/main.c
index 63d3e8f2970c..379090fadac9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -693,7 +693,7 @@ int __init_or_module do_one_initcall(initcall_t fn)
693 693
694 if (preempt_count() != count) { 694 if (preempt_count() != count) {
695 sprintf(msgbuf, "preemption imbalance "); 695 sprintf(msgbuf, "preemption imbalance ");
696 preempt_count() = count; 696 preempt_count_set(count);
697 } 697 }
698 if (irqs_disabled()) { 698 if (irqs_disabled()) {
699 strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); 699 strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..e8ca97b5c386 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,7 @@
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h> 12#include <linux/page_cgroup.h>
13#include <linux/log2.h>
13 14
14void foo(void) 15void foo(void)
15{ 16{
@@ -17,5 +18,8 @@ void foo(void)
17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 18 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 19 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); 20 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
21#ifdef CONFIG_SMP
22 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
23#endif
20 /* End of constants */ 24 /* End of constants */
21} 25}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 859c8dfd78a1..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
120 * instead of preempt_schedule() to exit user context if needed before 120 * instead of preempt_schedule() to exit user context if needed before
121 * calling the scheduler. 121 * calling the scheduler.
122 */ 122 */
123void __sched notrace preempt_schedule_context(void) 123asmlinkage void __sched notrace preempt_schedule_context(void)
124{ 124{
125 enum ctx_state prev_ctx; 125 enum ctx_state prev_ctx;
126 126
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d7f07a2da5a6..63aa50d7ce1e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
308 } 308 }
309 smpboot_park_threads(cpu); 309 smpboot_park_threads(cpu);
310 310
311 /*
312 * By now we've cleared cpu_active_mask; wait for all preempt-disabled
313 * and RCU users of this state to go away, so that all new such users
314 * will observe it.
315 *
316 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
317 * not imply sync_sched(), so explicitly call both.
318 */
319#ifdef CONFIG_PREEMPT
320 synchronize_sched();
321#endif
322 synchronize_rcu();
323
324 /*
325 * So now all preempt/rcu users must observe !cpu_active().
326 */
327
311 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 328 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
312 if (err) { 329 if (err) {
313 /* CPU didn't die: tell everyone. Can't complain. */ 330 /* CPU didn't die: tell everyone. Can't complain. */
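
The comment added above is the heart of this hunk: anything that samples cpu_active() under preempt_disable() or rcu_read_lock() must either finish before take_cpu_down() runs or already observe the CPU as inactive, and with preemptible RCU the two read-side flavours need separate grace periods. An illustrative reader-side sketch (hypothetical helper, not in the patch):

	#include <linux/cpumask.h>
	#include <linux/preempt.h>

	/*
	 * Any section like this either completes before __stop_machine()
	 * runs the actual teardown, or already sees the CPU as !active,
	 * because _cpu_down() now does synchronize_sched() and
	 * synchronize_rcu() in between.
	 */
	static bool example_target_cpu(int cpu)
	{
		bool usable;

		preempt_disable();
		usable = cpu_active(cpu);
		if (usable) {
			/* ... safe to direct work at 'cpu' here ... */
		}
		preempt_enable();

		return usable;
	}
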
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
44 rcu_idle_enter(); 44 rcu_idle_enter();
45 trace_cpu_idle_rcuidle(0, smp_processor_id()); 45 trace_cpu_idle_rcuidle(0, smp_processor_id());
46 local_irq_enable(); 46 local_irq_enable();
47 while (!need_resched()) 47 while (!tif_need_resched())
48 cpu_relax(); 48 cpu_relax();
49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
50 rcu_idle_exit(); 50 rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) { 92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
93 cpu_idle_poll(); 93 cpu_idle_poll();
94 } else { 94 } else {
95 current_clr_polling(); 95 if (!current_clr_polling_and_test()) {
96 if (!need_resched()) {
97 stop_critical_timings(); 96 stop_critical_timings();
98 rcu_idle_enter(); 97 rcu_idle_enter();
99 arch_cpu_idle(); 98 arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
103 } else { 102 } else {
104 local_irq_enable(); 103 local_irq_enable();
105 } 104 }
106 current_set_polling(); 105 __current_set_polling();
107 } 106 }
108 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
110 * bit here because we might not have sent the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
109 } 115 }
110 tick_nohz_idle_exit(); 116 tick_nohz_idle_exit();
111 schedule_preempt_disabled(); 117 schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
129 */ 135 */
130 boot_init_stack_canary(); 136 boot_init_stack_canary();
131#endif 137#endif
132 current_set_polling(); 138 __current_set_polling();
133 arch_cpu_idle_prepare(); 139 arch_cpu_idle_prepare();
134 cpu_idle_loop(); 140 cpu_idle_loop();
135} 141}
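
The idle-loop change ties into the NEED_RESCHED folding introduced elsewhere in this series: a remote resched_task() may set TIF_NEED_RESCHED and legitimately skip the reschedule IPI when this CPU has advertised that it is polling, so the idle loop must propagate the flag into the preempt counter itself. A conceptual sketch of that check (simplified; the real logic is the tif_need_resched()/set_preempt_need_resched() pair shown above):

	#include <linux/preempt.h>
	#include <linux/thread_info.h>

	static inline bool example_idle_exit_check(void)
	{
		/* Nobody sent an IPI, so nobody folded the flag for us. */
		if (!tif_need_resched())
			return false;

		/* Mirror it so the preempt_enable() fast path notices. */
		set_preempt_need_resched();
		return true;
	}
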
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..c93be06dee87 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
817#ifdef CONFIG_TRANSPARENT_HUGEPAGE 817#ifdef CONFIG_TRANSPARENT_HUGEPAGE
818 mm->pmd_huge_pte = NULL; 818 mm->pmd_huge_pte = NULL;
819#endif 819#endif
820#ifdef CONFIG_NUMA_BALANCING
821 mm->first_nid = NUMA_PTE_SCAN_INIT;
822#endif
823 if (!mm_init(mm, tsk)) 820 if (!mm_init(mm, tsk))
824 goto fail_nomem; 821 goto fail_nomem;
825 822
@@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1313#endif 1310#endif
1314 1311
1315 /* Perform scheduler related setup. Assign this task to a CPU. */ 1312 /* Perform scheduler related setup. Assign this task to a CPU. */
1316 sched_fork(p); 1313 sched_fork(clone_flags, p);
1317 1314
1318 retval = perf_event_init_task(p); 1315 retval = perf_event_init_task(p);
1319 if (retval) 1316 if (retval)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 32618b3fe4e6..1dc9f3604ad8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -898,6 +898,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
898 force_quiescent_state(rsp); /* Kick them all. */ 898 force_quiescent_state(rsp); /* Kick them all. */
899} 899}
900 900
901/*
902 * This function really isn't for public consumption, but RCU is special in
903 * that context switches can allow the state machine to make progress.
904 */
905extern void resched_cpu(int cpu);
906
901static void print_cpu_stall(struct rcu_state *rsp) 907static void print_cpu_stall(struct rcu_state *rsp)
902{ 908{
903 int cpu; 909 int cpu;
@@ -927,7 +933,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
927 3 * rcu_jiffies_till_stall_check() + 3; 933 3 * rcu_jiffies_till_stall_check() + 3;
928 raw_spin_unlock_irqrestore(&rnp->lock, flags); 934 raw_spin_unlock_irqrestore(&rnp->lock, flags);
929 935
930 set_need_resched(); /* kick ourselves to get things going. */ 936 /*
937 * Attempt to revive the RCU machinery by forcing a context switch.
938 *
939 * A context switch would normally allow the RCU state machine to make
940 * progress, and it could be that we're stuck in kernel space without context
941 * switches for an entirely unreasonable amount of time.
942 */
943 resched_cpu(smp_processor_id());
931} 944}
932 945
933static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 946static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ac63c9a995a..450a34b2a637 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
513 * might also involve a cross-CPU call to trigger the scheduler on 513 * might also involve a cross-CPU call to trigger the scheduler on
514 * the target CPU. 514 * the target CPU.
515 */ 515 */
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p) 516void resched_task(struct task_struct *p)
518{ 517{
519 int cpu; 518 int cpu;
520 519
521 assert_raw_spin_locked(&task_rq(p)->lock); 520 lockdep_assert_held(&task_rq(p)->lock);
522 521
523 if (test_tsk_need_resched(p)) 522 if (test_tsk_need_resched(p))
524 return; 523 return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
526 set_tsk_need_resched(p); 525 set_tsk_need_resched(p);
527 526
528 cpu = task_cpu(p); 527 cpu = task_cpu(p);
529 if (cpu == smp_processor_id()) 528 if (cpu == smp_processor_id()) {
529 set_preempt_need_resched();
530 return; 530 return;
531 }
531 532
532 /* NEED_RESCHED must be visible before we test polling */ 533 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 534 smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 547 raw_spin_unlock_irqrestore(&rq->lock, flags);
547} 548}
548 549
550#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON 551#ifdef CONFIG_NO_HZ_COMMON
550/* 552/*
551 * In the semi idle case, use the nearest busy cpu for migrating timers 553 * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
693 } 695 }
694} 696}
695 697
696#else /* !CONFIG_SMP */
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif /* CONFIG_SMP */ 698#endif /* CONFIG_SMP */
703 699
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 700#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 763static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{ 764{
769 update_rq_clock(rq); 765 update_rq_clock(rq);
770 sched_info_queued(p); 766 sched_info_queued(rq, p);
771 p->sched_class->enqueue_task(rq, p, flags); 767 p->sched_class->enqueue_task(rq, p, flags);
772} 768}
773 769
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 770static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{ 771{
776 update_rq_clock(rq); 772 update_rq_clock(rq);
777 sched_info_dequeued(p); 773 sched_info_dequeued(rq, p);
778 p->sched_class->dequeue_task(rq, p, flags); 774 p->sched_class->dequeue_task(rq, p, flags);
779} 775}
780 776
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
987 * ttwu() will sort out the placement. 983 * ttwu() will sort out the placement.
988 */ 984 */
989 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
990 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 986 !(task_preempt_count(p) & PREEMPT_ACTIVE));
991 987
992#ifdef CONFIG_LOCKDEP 988#ifdef CONFIG_LOCKDEP
993 /* 989 /*
@@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1017 __set_task_cpu(p, new_cpu); 1013 __set_task_cpu(p, new_cpu);
1018} 1014}
1019 1015
1016static void __migrate_swap_task(struct task_struct *p, int cpu)
1017{
1018 if (p->on_rq) {
1019 struct rq *src_rq, *dst_rq;
1020
1021 src_rq = task_rq(p);
1022 dst_rq = cpu_rq(cpu);
1023
1024 deactivate_task(src_rq, p, 0);
1025 set_task_cpu(p, cpu);
1026 activate_task(dst_rq, p, 0);
1027 check_preempt_curr(dst_rq, p, 0);
1028 } else {
1029 /*
1030 * Task isn't running anymore; make it appear like we migrated
1031 * it before it went to sleep. This means on wakeup we make the
1032 * previous cpu our target instead of where it really is.
1033 */
1034 p->wake_cpu = cpu;
1035 }
1036}
1037
1038struct migration_swap_arg {
1039 struct task_struct *src_task, *dst_task;
1040 int src_cpu, dst_cpu;
1041};
1042
1043static int migrate_swap_stop(void *data)
1044{
1045 struct migration_swap_arg *arg = data;
1046 struct rq *src_rq, *dst_rq;
1047 int ret = -EAGAIN;
1048
1049 src_rq = cpu_rq(arg->src_cpu);
1050 dst_rq = cpu_rq(arg->dst_cpu);
1051
1052 double_raw_lock(&arg->src_task->pi_lock,
1053 &arg->dst_task->pi_lock);
1054 double_rq_lock(src_rq, dst_rq);
1055 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1056 goto unlock;
1057
1058 if (task_cpu(arg->src_task) != arg->src_cpu)
1059 goto unlock;
1060
1061 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1062 goto unlock;
1063
1064 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1065 goto unlock;
1066
1067 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1068 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1069
1070 ret = 0;
1071
1072unlock:
1073 double_rq_unlock(src_rq, dst_rq);
1074 raw_spin_unlock(&arg->dst_task->pi_lock);
1075 raw_spin_unlock(&arg->src_task->pi_lock);
1076
1077 return ret;
1078}
1079
1080/*
1081 * Cross migrate two tasks
1082 */
1083int migrate_swap(struct task_struct *cur, struct task_struct *p)
1084{
1085 struct migration_swap_arg arg;
1086 int ret = -EINVAL;
1087
1088 arg = (struct migration_swap_arg){
1089 .src_task = cur,
1090 .src_cpu = task_cpu(cur),
1091 .dst_task = p,
1092 .dst_cpu = task_cpu(p),
1093 };
1094
1095 if (arg.src_cpu == arg.dst_cpu)
1096 goto out;
1097
1098 /*
1099 * These three tests are all lockless; this is OK since all of them
1100 * will be re-checked with proper locks held further down the line.
1101 */
1102 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1103 goto out;
1104
1105 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1106 goto out;
1107
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out;
1110
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112
1113out:
1114 return ret;
1115}
1116
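
migrate_swap() gives the NUMA placement code a way to exchange two tasks atomically under stop_two_cpus() instead of piling one of them onto an already busy CPU. Purely as an illustration (hypothetical and simplified; assumes the declarations visible inside kernel/sched/, with 'best_task'/'best_cpu' standing for the result of a prior search such as task_numa_compare() in fair.c):

	/* Illustrative only -- not code from this patch. */
	static int example_apply_numa_decision(struct task_struct *p,
					       struct task_struct *best_task,
					       int best_cpu)
	{
		/* No swap partner found: fall back to a one-way move. */
		if (!best_task)
			return migrate_task_to(p, best_cpu);

		/* Otherwise exchange the two tasks via stop_two_cpus(). */
		return migrate_swap(p, best_task);
	}

(migrate_task_to() is added further down in this file, under CONFIG_NUMA_BALANCING.)
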
1020struct migration_arg { 1117struct migration_arg {
1021 struct task_struct *task; 1118 struct task_struct *task;
1022 int dest_cpu; 1119 int dest_cpu;
@@ -1236,9 +1333,9 @@ out:
1236 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1333 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1237 */ 1334 */
1238static inline 1335static inline
1239int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1336int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1240{ 1337{
1241 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1338 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1242 1339
1243 /* 1340 /*
1244 * In order not to call set_task_cpu() on a blocking task we need 1341 * In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1330 1427
1331 if (rq->idle_stamp) { 1428 if (rq->idle_stamp) {
1332 u64 delta = rq_clock(rq) - rq->idle_stamp; 1429 u64 delta = rq_clock(rq) - rq->idle_stamp;
1333 u64 max = 2*sysctl_sched_migration_cost; 1430 u64 max = 2*rq->max_idle_balance_cost;
1334 1431
1335 if (delta > max) 1432 update_avg(&rq->avg_idle, delta);
1433
1434 if (rq->avg_idle > max)
1336 rq->avg_idle = max; 1435 rq->avg_idle = max;
1337 else 1436
1338 update_avg(&rq->avg_idle, delta);
1339 rq->idle_stamp = 0; 1437 rq->idle_stamp = 0;
1340 } 1438 }
1341#endif 1439#endif
@@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void)
1396 1494
1397void scheduler_ipi(void) 1495void scheduler_ipi(void)
1398{ 1496{
1497 /*
1498 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI.
1501 */
1502 if (tif_need_resched())
1503 set_preempt_need_resched();
1504
1399 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1400 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
1401 && !got_nohz_idle_kick()) 1507 && !got_nohz_idle_kick())
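
The resched_task() and scheduler_ipi() hunks above encode a single rule: whoever sets TIF_NEED_RESCHED for a CPU must also fold it into that CPU's preempt count, locally via set_preempt_need_resched() and remotely via the IPI (or, for polling idle CPUs, via the idle loop itself). The pay-off is that the preempt_enable()/return-from-interrupt fast path only has to test one word. A rough sketch of the invariant (a deliberately slow, two-step formulation, not the real asm/preempt.h code):

	#include <linux/preempt.h>
	#include <linux/thread_info.h>

	static inline bool example_should_reschedule(void)
	{
		/*
		 * The folded preempt count answers this with a single
		 * comparison against zero: preemption is allowed (count
		 * is 0) and a reschedule has been requested.
		 */
		return preempt_count() == 0 && tif_need_resched();
	}
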
@@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1513 if (p->sched_class->task_waking) 1619 if (p->sched_class->task_waking)
1514 p->sched_class->task_waking(p); 1620 p->sched_class->task_waking(p);
1515 1621
1516 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1622 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1517 if (task_cpu(p) != cpu) { 1623 if (task_cpu(p) != cpu) {
1518 wake_flags |= WF_MIGRATED; 1624 wake_flags |= WF_MIGRATED;
1519 set_task_cpu(p, cpu); 1625 set_task_cpu(p, cpu);
@@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1595 * 1701 *
1596 * __sched_fork() is basic setup used by init_idle() too: 1702 * __sched_fork() is basic setup used by init_idle() too:
1597 */ 1703 */
1598static void __sched_fork(struct task_struct *p) 1704static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1599{ 1705{
1600 p->on_rq = 0; 1706 p->on_rq = 0;
1601 1707
@@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
1619 1725
1620#ifdef CONFIG_NUMA_BALANCING 1726#ifdef CONFIG_NUMA_BALANCING
1621 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1727 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1622 p->mm->numa_next_scan = jiffies; 1728 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1623 p->mm->numa_next_reset = jiffies;
1624 p->mm->numa_scan_seq = 0; 1729 p->mm->numa_scan_seq = 0;
1625 } 1730 }
1626 1731
1732 if (clone_flags & CLONE_VM)
1733 p->numa_preferred_nid = current->numa_preferred_nid;
1734 else
1735 p->numa_preferred_nid = -1;
1736
1627 p->node_stamp = 0ULL; 1737 p->node_stamp = 0ULL;
1628 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1738 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1629 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1630 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1739 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1631 p->numa_work.next = &p->numa_work; 1740 p->numa_work.next = &p->numa_work;
1741 p->numa_faults = NULL;
1742 p->numa_faults_buffer = NULL;
1743
1744 INIT_LIST_HEAD(&p->numa_entry);
1745 p->numa_group = NULL;
1632#endif /* CONFIG_NUMA_BALANCING */ 1746#endif /* CONFIG_NUMA_BALANCING */
1633} 1747}
1634 1748
@@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
1654/* 1768/*
1655 * fork()/clone()-time setup: 1769 * fork()/clone()-time setup:
1656 */ 1770 */
1657void sched_fork(struct task_struct *p) 1771void sched_fork(unsigned long clone_flags, struct task_struct *p)
1658{ 1772{
1659 unsigned long flags; 1773 unsigned long flags;
1660 int cpu = get_cpu(); 1774 int cpu = get_cpu();
1661 1775
1662 __sched_fork(p); 1776 __sched_fork(clone_flags, p);
1663 /* 1777 /*
1664 * We mark the process as running here. This guarantees that 1778 * We mark the process as running here. This guarantees that
1665 * nobody will actually run it, and a signal or other external 1779 * nobody will actually run it, and a signal or other external
@@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p)
1717#if defined(CONFIG_SMP) 1831#if defined(CONFIG_SMP)
1718 p->on_cpu = 0; 1832 p->on_cpu = 0;
1719#endif 1833#endif
1720#ifdef CONFIG_PREEMPT_COUNT 1834 init_task_preempt_count(p);
1721 /* Want to start with kernel preemption disabled. */
1722 task_thread_info(p)->preempt_count = 1;
1723#endif
1724#ifdef CONFIG_SMP 1835#ifdef CONFIG_SMP
1725 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1836 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1726#endif 1837#endif
@@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
1747 * - cpus_allowed can change in the fork path 1858 * - cpus_allowed can change in the fork path
1748 * - any previously selected cpu might disappear through hotplug 1859 * - any previously selected cpu might disappear through hotplug
1749 */ 1860 */
1750 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1861 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
1751#endif 1862#endif
1752 1863
1753 /* Initialize new task's runnable average */ 1864 /* Initialize new task's runnable average */
@@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1838 struct task_struct *next) 1949 struct task_struct *next)
1839{ 1950{
1840 trace_sched_switch(prev, next); 1951 trace_sched_switch(prev, next);
1841 sched_info_switch(prev, next); 1952 sched_info_switch(rq, prev, next);
1842 perf_event_task_sched_out(prev, next); 1953 perf_event_task_sched_out(prev, next);
1843 fire_sched_out_preempt_notifiers(prev, next); 1954 fire_sched_out_preempt_notifiers(prev, next);
1844 prepare_lock_switch(rq, next); 1955 prepare_lock_switch(rq, next);
@@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1890 if (mm) 2001 if (mm)
1891 mmdrop(mm); 2002 mmdrop(mm);
1892 if (unlikely(prev_state == TASK_DEAD)) { 2003 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev);
2005
1893 /* 2006 /*
1894 * Remove function-return probe instances associated with this 2007 * Remove function-return probe instances associated with this
1895 * task and put them back on the free list. 2008 * task and put them back on the free list.
@@ -2073,7 +2186,7 @@ void sched_exec(void)
2073 int dest_cpu; 2186 int dest_cpu;
2074 2187
2075 raw_spin_lock_irqsave(&p->pi_lock, flags); 2188 raw_spin_lock_irqsave(&p->pi_lock, flags);
2076 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2189 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2077 if (dest_cpu == smp_processor_id()) 2190 if (dest_cpu == smp_processor_id())
2078 goto unlock; 2191 goto unlock;
2079 2192
@@ -2215,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2215#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2328#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2216 defined(CONFIG_PREEMPT_TRACER)) 2329 defined(CONFIG_PREEMPT_TRACER))
2217 2330
2218void __kprobes add_preempt_count(int val) 2331void __kprobes preempt_count_add(int val)
2219{ 2332{
2220#ifdef CONFIG_DEBUG_PREEMPT 2333#ifdef CONFIG_DEBUG_PREEMPT
2221 /* 2334 /*
@@ -2224,7 +2337,7 @@ void __kprobes add_preempt_count(int val)
2224 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2337 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2225 return; 2338 return;
2226#endif 2339#endif
2227 preempt_count() += val; 2340 __preempt_count_add(val);
2228#ifdef CONFIG_DEBUG_PREEMPT 2341#ifdef CONFIG_DEBUG_PREEMPT
2229 /* 2342 /*
2230 * Spinlock count overflowing soon? 2343 * Spinlock count overflowing soon?
@@ -2235,9 +2348,9 @@ void __kprobes add_preempt_count(int val)
2235 if (preempt_count() == val) 2348 if (preempt_count() == val)
2236 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2349 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2237} 2350}
2238EXPORT_SYMBOL(add_preempt_count); 2351EXPORT_SYMBOL(preempt_count_add);
2239 2352
2240void __kprobes sub_preempt_count(int val) 2353void __kprobes preempt_count_sub(int val)
2241{ 2354{
2242#ifdef CONFIG_DEBUG_PREEMPT 2355#ifdef CONFIG_DEBUG_PREEMPT
2243 /* 2356 /*
@@ -2255,9 +2368,9 @@ void __kprobes sub_preempt_count(int val)
2255 2368
2256 if (preempt_count() == val) 2369 if (preempt_count() == val)
2257 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2370 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2258 preempt_count() -= val; 2371 __preempt_count_sub(val);
2259} 2372}
2260EXPORT_SYMBOL(sub_preempt_count); 2373EXPORT_SYMBOL(preempt_count_sub);
2261 2374
2262#endif 2375#endif
2263 2376
@@ -2430,6 +2543,7 @@ need_resched:
2430 put_prev_task(rq, prev); 2543 put_prev_task(rq, prev);
2431 next = pick_next_task(rq); 2544 next = pick_next_task(rq);
2432 clear_tsk_need_resched(prev); 2545 clear_tsk_need_resched(prev);
2546 clear_preempt_need_resched();
2433 rq->skip_clock_update = 0; 2547 rq->skip_clock_update = 0;
2434 2548
2435 if (likely(prev != next)) { 2549 if (likely(prev != next)) {
@@ -2520,9 +2634,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
2520 return; 2634 return;
2521 2635
2522 do { 2636 do {
2523 add_preempt_count_notrace(PREEMPT_ACTIVE); 2637 __preempt_count_add(PREEMPT_ACTIVE);
2524 __schedule(); 2638 __schedule();
2525 sub_preempt_count_notrace(PREEMPT_ACTIVE); 2639 __preempt_count_sub(PREEMPT_ACTIVE);
2526 2640
2527 /* 2641 /*
2528 * Check again in case we missed a preemption opportunity 2642 * Check again in case we missed a preemption opportunity
@@ -2541,20 +2655,19 @@ EXPORT_SYMBOL(preempt_schedule);
2541 */ 2655 */
2542asmlinkage void __sched preempt_schedule_irq(void) 2656asmlinkage void __sched preempt_schedule_irq(void)
2543{ 2657{
2544 struct thread_info *ti = current_thread_info();
2545 enum ctx_state prev_state; 2658 enum ctx_state prev_state;
2546 2659
2547 /* Catch callers which need to be fixed */ 2660 /* Catch callers which need to be fixed */
2548 BUG_ON(ti->preempt_count || !irqs_disabled()); 2661 BUG_ON(preempt_count() || !irqs_disabled());
2549 2662
2550 prev_state = exception_enter(); 2663 prev_state = exception_enter();
2551 2664
2552 do { 2665 do {
2553 add_preempt_count(PREEMPT_ACTIVE); 2666 __preempt_count_add(PREEMPT_ACTIVE);
2554 local_irq_enable(); 2667 local_irq_enable();
2555 __schedule(); 2668 __schedule();
2556 local_irq_disable(); 2669 local_irq_disable();
2557 sub_preempt_count(PREEMPT_ACTIVE); 2670 __preempt_count_sub(PREEMPT_ACTIVE);
2558 2671
2559 /* 2672 /*
2560 * Check again in case we missed a preemption opportunity 2673 * Check again in case we missed a preemption opportunity
@@ -3598,13 +3711,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3598 struct task_struct *p; 3711 struct task_struct *p;
3599 int retval; 3712 int retval;
3600 3713
3601 get_online_cpus();
3602 rcu_read_lock(); 3714 rcu_read_lock();
3603 3715
3604 p = find_process_by_pid(pid); 3716 p = find_process_by_pid(pid);
3605 if (!p) { 3717 if (!p) {
3606 rcu_read_unlock(); 3718 rcu_read_unlock();
3607 put_online_cpus();
3608 return -ESRCH; 3719 return -ESRCH;
3609 } 3720 }
3610 3721
@@ -3661,7 +3772,6 @@ out_free_cpus_allowed:
3661 free_cpumask_var(cpus_allowed); 3772 free_cpumask_var(cpus_allowed);
3662out_put_task: 3773out_put_task:
3663 put_task_struct(p); 3774 put_task_struct(p);
3664 put_online_cpus();
3665 return retval; 3775 return retval;
3666} 3776}
3667 3777
@@ -3706,7 +3816,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3706 unsigned long flags; 3816 unsigned long flags;
3707 int retval; 3817 int retval;
3708 3818
3709 get_online_cpus();
3710 rcu_read_lock(); 3819 rcu_read_lock();
3711 3820
3712 retval = -ESRCH; 3821 retval = -ESRCH;
@@ -3719,12 +3828,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3719 goto out_unlock; 3828 goto out_unlock;
3720 3829
3721 raw_spin_lock_irqsave(&p->pi_lock, flags); 3830 raw_spin_lock_irqsave(&p->pi_lock, flags);
3722 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 3831 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3723 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3832 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3724 3833
3725out_unlock: 3834out_unlock:
3726 rcu_read_unlock(); 3835 rcu_read_unlock();
3727 put_online_cpus();
3728 3836
3729 return retval; 3837 return retval;
3730} 3838}
@@ -3794,16 +3902,11 @@ SYSCALL_DEFINE0(sched_yield)
3794 return 0; 3902 return 0;
3795} 3903}
3796 3904
3797static inline int should_resched(void)
3798{
3799 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3800}
3801
3802static void __cond_resched(void) 3905static void __cond_resched(void)
3803{ 3906{
3804 add_preempt_count(PREEMPT_ACTIVE); 3907 __preempt_count_add(PREEMPT_ACTIVE);
3805 __schedule(); 3908 __schedule();
3806 sub_preempt_count(PREEMPT_ACTIVE); 3909 __preempt_count_sub(PREEMPT_ACTIVE);
3807} 3910}
3808 3911
3809int __sched _cond_resched(void) 3912int __sched _cond_resched(void)
@@ -4186,7 +4289,7 @@ void init_idle(struct task_struct *idle, int cpu)
4186 4289
4187 raw_spin_lock_irqsave(&rq->lock, flags); 4290 raw_spin_lock_irqsave(&rq->lock, flags);
4188 4291
4189 __sched_fork(idle); 4292 __sched_fork(0, idle);
4190 idle->state = TASK_RUNNING; 4293 idle->state = TASK_RUNNING;
4191 idle->se.exec_start = sched_clock(); 4294 idle->se.exec_start = sched_clock();
4192 4295
@@ -4212,7 +4315,7 @@ void init_idle(struct task_struct *idle, int cpu)
4212 raw_spin_unlock_irqrestore(&rq->lock, flags); 4315 raw_spin_unlock_irqrestore(&rq->lock, flags);
4213 4316
4214 /* Set the preempt count _outside_ the spinlocks! */ 4317 /* Set the preempt count _outside_ the spinlocks! */
4215 task_thread_info(idle)->preempt_count = 0; 4318 init_idle_preempt_count(idle, cpu);
4216 4319
4217 /* 4320 /*
4218 * The idle tasks have their own, simple scheduling class: 4321 * The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4449,53 @@ fail:
4346 return ret; 4449 return ret;
4347} 4450}
4348 4451
4452#ifdef CONFIG_NUMA_BALANCING
4453/* Migrate current task p to target_cpu */
4454int migrate_task_to(struct task_struct *p, int target_cpu)
4455{
4456 struct migration_arg arg = { p, target_cpu };
4457 int curr_cpu = task_cpu(p);
4458
4459 if (curr_cpu == target_cpu)
4460 return 0;
4461
4462 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4463 return -EINVAL;
4464
4465 /* TODO: This is not properly updating schedstats */
4466
4467 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4468}
4469
4470/*
4471 * Requeue a task on a given node and accurately track the number of NUMA
4472 * tasks on the runqueues
4473 */
4474void sched_setnuma(struct task_struct *p, int nid)
4475{
4476 struct rq *rq;
4477 unsigned long flags;
4478 bool on_rq, running;
4479
4480 rq = task_rq_lock(p, &flags);
4481 on_rq = p->on_rq;
4482 running = task_current(rq, p);
4483
4484 if (on_rq)
4485 dequeue_task(rq, p, 0);
4486 if (running)
4487 p->sched_class->put_prev_task(rq, p);
4488
4489 p->numa_preferred_nid = nid;
4490
4491 if (running)
4492 p->sched_class->set_curr_task(rq);
4493 if (on_rq)
4494 enqueue_task(rq, p, 0);
4495 task_rq_unlock(rq, p, &flags);
4496}
4497#endif
4498
4349/* 4499/*
4350 * migration_cpu_stop - this will be executed by a highprio stopper thread 4500 * migration_cpu_stop - this will be executed by a highprio stopper thread
4351 * and performs thread migration by bumping thread off CPU then 4501 * and performs thread migration by bumping thread off CPU then
@@ -5119,6 +5269,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5119DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5269DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5120DEFINE_PER_CPU(int, sd_llc_size); 5270DEFINE_PER_CPU(int, sd_llc_size);
5121DEFINE_PER_CPU(int, sd_llc_id); 5271DEFINE_PER_CPU(int, sd_llc_id);
5272DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5122 5273
5123static void update_top_cache_domain(int cpu) 5274static void update_top_cache_domain(int cpu)
5124{ 5275{
@@ -5135,6 +5286,9 @@ static void update_top_cache_domain(int cpu)
5135 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5286 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5136 per_cpu(sd_llc_size, cpu) = size; 5287 per_cpu(sd_llc_size, cpu) = size;
5137 per_cpu(sd_llc_id, cpu) = id; 5288 per_cpu(sd_llc_id, cpu) = id;
5289
5290 sd = lowest_flag_domain(cpu, SD_NUMA);
5291 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
5138} 5292}
5139 5293
5140/* 5294/*
@@ -5654,6 +5808,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5654 | 0*SD_SHARE_PKG_RESOURCES 5808 | 0*SD_SHARE_PKG_RESOURCES
5655 | 1*SD_SERIALIZE 5809 | 1*SD_SERIALIZE
5656 | 0*SD_PREFER_SIBLING 5810 | 0*SD_PREFER_SIBLING
5811 | 1*SD_NUMA
5657 | sd_local_flags(level) 5812 | sd_local_flags(level)
5658 , 5813 ,
5659 .last_balance = jiffies, 5814 .last_balance = jiffies,
@@ -6335,14 +6490,17 @@ void __init sched_init_smp(void)
6335 6490
6336 sched_init_numa(); 6491 sched_init_numa();
6337 6492
6338 get_online_cpus(); 6493 /*
6494 * There's no userspace yet to cause hotplug operations; hence all the
6495 * cpu masks are stable and all blatant races in the below code cannot
6496 * happen.
6497 */
6339 mutex_lock(&sched_domains_mutex); 6498 mutex_lock(&sched_domains_mutex);
6340 init_sched_domains(cpu_active_mask); 6499 init_sched_domains(cpu_active_mask);
6341 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6500 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6342 if (cpumask_empty(non_isolated_cpus)) 6501 if (cpumask_empty(non_isolated_cpus))
6343 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6502 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6344 mutex_unlock(&sched_domains_mutex); 6503 mutex_unlock(&sched_domains_mutex);
6345 put_online_cpus();
6346 6504
6347 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6505 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6348 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6506 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6505,6 +6663,7 @@ void __init sched_init(void)
6505 rq->online = 0; 6663 rq->online = 0;
6506 rq->idle_stamp = 0; 6664 rq->idle_stamp = 0;
6507 rq->avg_idle = 2*sysctl_sched_migration_cost; 6665 rq->avg_idle = 2*sysctl_sched_migration_cost;
6666 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6508 6667
6509 INIT_LIST_HEAD(&rq->cfs_tasks); 6668 INIT_LIST_HEAD(&rq->cfs_tasks);
6510 6669
@@ -7277,7 +7436,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7277 7436
7278 runtime_enabled = quota != RUNTIME_INF; 7437 runtime_enabled = quota != RUNTIME_INF;
7279 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7438 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7280 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7439 /*
7440 * If we need to toggle cfs_bandwidth_used, off->on must occur
7441 * before making related changes, and on->off must occur afterwards
7442 */
7443 if (runtime_enabled && !runtime_was_enabled)
7444 cfs_bandwidth_usage_inc();
7281 raw_spin_lock_irq(&cfs_b->lock); 7445 raw_spin_lock_irq(&cfs_b->lock);
7282 cfs_b->period = ns_to_ktime(period); 7446 cfs_b->period = ns_to_ktime(period);
7283 cfs_b->quota = quota; 7447 cfs_b->quota = quota;
@@ -7303,6 +7467,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7303 unthrottle_cfs_rq(cfs_rq); 7467 unthrottle_cfs_rq(cfs_rq);
7304 raw_spin_unlock_irq(&rq->lock); 7468 raw_spin_unlock_irq(&rq->lock);
7305 } 7469 }
7470 if (runtime_was_enabled && !runtime_enabled)
7471 cfs_bandwidth_usage_dec();
7306out_unlock: 7472out_unlock:
7307 mutex_unlock(&cfs_constraints_mutex); 7473 mutex_unlock(&cfs_constraints_mutex);
7308 7474
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 196559994f7c..5c34d1817e8f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18#include <linux/mempolicy.h>
18 19
19#include "sched.h" 20#include "sched.h"
20 21
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
137 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 138 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
138 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
139#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
143#endif
140#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
141 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
142#endif 146#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 read_lock_irqsave(&tasklist_lock, flags); 163 read_lock_irqsave(&tasklist_lock, flags);
160 164
161 do_each_thread(g, p) { 165 do_each_thread(g, p) {
162 if (!p->on_rq || task_cpu(p) != rq_cpu) 166 if (task_cpu(p) != rq_cpu)
163 continue; 167 continue;
164 168
165 print_task(m, rq, p); 169 print_task(m, rq, p);
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
225 atomic_read(&cfs_rq->tg->runnable_avg)); 229 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 230#endif
227#endif 231#endif
232#ifdef CONFIG_CFS_BANDWIDTH
233 SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
234 cfs_rq->tg->cfs_bandwidth.timer_active);
235 SEQ_printf(m, " .%-30s: %d\n", "throttled",
236 cfs_rq->throttled);
237 SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
238 cfs_rq->throttle_count);
239#endif
228 240
229#ifdef CONFIG_FAIR_GROUP_SCHED 241#ifdef CONFIG_FAIR_GROUP_SCHED
230 print_cfs_group_stats(m, cpu, cfs_rq->tg); 242 print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
345 cpu_clk = local_clock(); 357 cpu_clk = local_clock();
346 local_irq_restore(flags); 358 local_irq_restore(flags);
347 359
348 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", 360 SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
349 init_utsname()->release, 361 init_utsname()->release,
350 (int)strcspn(init_utsname()->version, " "), 362 (int)strcspn(init_utsname()->version, " "),
351 init_utsname()->version); 363 init_utsname()->version);
@@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void)
488 500
489__initcall(init_sched_debug_procfs); 501__initcall(init_sched_debug_procfs);
490 502
503#define __P(F) \
504 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
505#define P(F) \
506 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
507#define __PN(F) \
508 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
509#define PN(F) \
510 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
511
512
513static void sched_show_numa(struct task_struct *p, struct seq_file *m)
514{
515#ifdef CONFIG_NUMA_BALANCING
516 struct mempolicy *pol;
517 int node, i;
518
519 if (p->mm)
520 P(mm->numa_scan_seq);
521
522 task_lock(p);
523 pol = p->mempolicy;
524 if (pol && !(pol->flags & MPOL_F_MORON))
525 pol = NULL;
526 mpol_get(pol);
527 task_unlock(p);
528
529 SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
530
531 for_each_online_node(node) {
532 for (i = 0; i < 2; i++) {
533 unsigned long nr_faults = -1;
534 int cpu_current, home_node;
535
536 if (p->numa_faults)
537 nr_faults = p->numa_faults[2*node + i];
538
539 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes));
541
542 home_node = (p->numa_preferred_nid == node);
543
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults);
546 }
547 }
548
549 mpol_put(pol);
550#endif
551}
552
491void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 553void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
492{ 554{
493 unsigned long nr_switches; 555 unsigned long nr_switches;
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
591 SEQ_printf(m, "%-45s:%21Ld\n", 653 SEQ_printf(m, "%-45s:%21Ld\n",
592 "clock-delta", (long long)(t1-t0)); 654 "clock-delta", (long long)(t1-t0));
593 } 655 }
656
657 sched_show_numa(p, m);
594} 658}
595 659
596void proc_sched_set_task(struct task_struct *p) 660void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c70201fbc61..41c02b6b090e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
681} 681}
682 682
683#ifdef CONFIG_SMP 683#ifdef CONFIG_SMP
684static unsigned long task_h_load(struct task_struct *p);
685
684static inline void __update_task_entity_contrib(struct sched_entity *se); 686static inline void __update_task_entity_contrib(struct sched_entity *se);
685 687
686/* Give new task start runnable values to heavy its load in infant time */ 688/* Give new task start runnable values to heavy its load in infant time */
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
818 820
819#ifdef CONFIG_NUMA_BALANCING 821#ifdef CONFIG_NUMA_BALANCING
820/* 822/*
821 * numa task sample period in ms 823 * Approximate time to scan a full NUMA task in ms. The task scan period is
824 * calculated based on the task's virtual memory size and
825 * numa_balancing_scan_size.
822 */ 826 */
823unsigned int sysctl_numa_balancing_scan_period_min = 100; 827unsigned int sysctl_numa_balancing_scan_period_min = 1000;
824unsigned int sysctl_numa_balancing_scan_period_max = 100*50; 828unsigned int sysctl_numa_balancing_scan_period_max = 60000;
825unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 829
827/* Portion of address space to scan in MB */ 830/* Portion of address space to scan in MB */
828unsigned int sysctl_numa_balancing_scan_size = 256; 831unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
830/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 833/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831unsigned int sysctl_numa_balancing_scan_delay = 1000; 834unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 835
833static void task_numa_placement(struct task_struct *p) 836/*
837 * After skipping a page migration on a shared page, skip N more numa page
838 * migrations unconditionally. This reduces the number of NUMA migrations
839 * in shared memory workloads, and has the effect of pulling tasks towards
840 * where their memory lives, rather than pulling the memory towards the task.
841 */
842unsigned int sysctl_numa_balancing_migrate_deferred = 16;
843
844static unsigned int task_nr_scan_windows(struct task_struct *p)
845{
846 unsigned long rss = 0;
847 unsigned long nr_scan_pages;
848
849 /*
850 * Calculations are based on RSS, since non-present and empty pages are
851 * skipped by the PTE scanner and NUMA hinting faults should be trapped
852 * based on resident pages.
853 */
854 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
855 rss = get_mm_rss(p->mm);
856 if (!rss)
857 rss = nr_scan_pages;
858
859 rss = round_up(rss, nr_scan_pages);
860 return rss / nr_scan_pages;
861}
862
863/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
864#define MAX_SCAN_WINDOW 2560
865
866static unsigned int task_scan_min(struct task_struct *p)
867{
868 unsigned int scan, floor;
869 unsigned int windows = 1;
870
871 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
872 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
873 floor = 1000 / windows;
874
875 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
876 return max_t(unsigned int, floor, scan);
877}
878
879static unsigned int task_scan_max(struct task_struct *p)
880{
881 unsigned int smin = task_scan_min(p);
882 unsigned int smax;
883
884 /* Watch for min being lower than max due to floor calculations */
885 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
886 return max(smin, smax);
887}
888
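
A worked example for the three helpers above (illustrative numbers; assumes 4 KB pages and the default sysctls: scan_size = 256 MB, scan_period_min = 1000 ms, scan_period_max = 60000 ms). For a task with 1 GB of resident memory:

	nr_scan_pages          = 256 MB              = 65536 pages
	rss                    = 1 GB                = 262144 pages
	task_nr_scan_windows() = 262144 / 65536      = 4
	task_scan_min()        = max(1000 / (2560/256), 1000 / 4) = max(100, 250) = 250 ms
	task_scan_max()        = max(250, 60000 / 4)               = 15000 ms

So a larger address space is scanned with a proportionally longer period between windows, which keeps the PTE-scan rate roughly constant and bounded by MAX_SCAN_WINDOW.
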
889/*
890 * Once a preferred node is selected the scheduler balancer will prefer moving
891 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
892 * scans. This will give the process the chance to accumulate more faults on
893 * the preferred node but still allow the scheduler to move the task again if
894 * the node's CPUs are overloaded.
895 */
896unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
897
898static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
899{
900 rq->nr_numa_running += (p->numa_preferred_nid != -1);
901 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
902}
903
904static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
905{
906 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
907 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
908}
909
910struct numa_group {
911 atomic_t refcount;
912
913 spinlock_t lock; /* nr_tasks, tasks */
914 int nr_tasks;
915 pid_t gid;
916 struct list_head task_list;
917
918 struct rcu_head rcu;
919 unsigned long total_faults;
920 unsigned long faults[0];
921};
922
923pid_t task_numa_group_id(struct task_struct *p)
924{
925 return p->numa_group ? p->numa_group->gid : 0;
926}
927
928static inline int task_faults_idx(int nid, int priv)
929{
930 return 2 * nid + priv;
931}
932
933static inline unsigned long task_faults(struct task_struct *p, int nid)
934{
935 if (!p->numa_faults)
936 return 0;
937
938 return p->numa_faults[task_faults_idx(nid, 0)] +
939 p->numa_faults[task_faults_idx(nid, 1)];
940}
941
942static inline unsigned long group_faults(struct task_struct *p, int nid)
943{
944 if (!p->numa_group)
945 return 0;
946
947 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
948}
949
950/*
951 * These return the fraction of accesses done by a particular task, or
952 * task group, on a particular numa node. The group weight is given a
953 * larger multiplier, in order to group tasks together that are almost
954 * evenly spread out between numa nodes.
955 */
956static inline unsigned long task_weight(struct task_struct *p, int nid)
957{
958 unsigned long total_faults;
959
960 if (!p->numa_faults)
961 return 0;
962
963 total_faults = p->total_numa_faults;
964
965 if (!total_faults)
966 return 0;
967
968 return 1000 * task_faults(p, nid) / total_faults;
969}
970
971static inline unsigned long group_weight(struct task_struct *p, int nid)
834{ 972{
835 int seq; 973 if (!p->numa_group || !p->numa_group->total_faults)
974 return 0;
975
976 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
977}
978
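
With purely illustrative numbers: a task whose total_numa_faults is 200, of which 150 were taken on node 1, gets task_weight(p, 1) = 1000 * 150 / 200 = 750; if its numa_group accumulated 1000 faults overall with 400 of them on node 1, then group_weight(p, 1) = 1000 * 400 / 1000 = 400. Both are per-mille fractions, so the placement code below can compare and difference them directly across nodes and tasks.
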
979static unsigned long weighted_cpuload(const int cpu);
980static unsigned long source_load(int cpu, int type);
981static unsigned long target_load(int cpu, int type);
982static unsigned long power_of(int cpu);
983static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
984
985/* Cached statistics for all CPUs within a node */
986struct numa_stats {
987 unsigned long nr_running;
988 unsigned long load;
989
990 /* Total compute capacity of CPUs on a node */
991 unsigned long power;
992
993 /* Approximate capacity in terms of runnable tasks on a node */
994 unsigned long capacity;
995 int has_capacity;
996};
997
998/*
999 * XXX borrowed from update_sg_lb_stats
1000 */
1001static void update_numa_stats(struct numa_stats *ns, int nid)
1002{
1003 int cpu;
1004
1005 memset(ns, 0, sizeof(*ns));
1006 for_each_cpu(cpu, cpumask_of_node(nid)) {
1007 struct rq *rq = cpu_rq(cpu);
1008
1009 ns->nr_running += rq->nr_running;
1010 ns->load += weighted_cpuload(cpu);
1011 ns->power += power_of(cpu);
1012 }
1013
1014 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1015 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1016 ns->has_capacity = (ns->nr_running < ns->capacity);
1017}
1018
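A standalone sketch of the arithmetic in update_numa_stats(), assuming the usual SCHED_POWER_SCALE of 1024 and invented per-CPU numbers; it shows how the summed load is normalised by compute power and how has_capacity is derived:

#include <stdio.h>

#define SCHED_POWER_SCALE       1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        /* two CPUs on the node, each at full power, with some load */
        unsigned long power = 2 * SCHED_POWER_SCALE;
        unsigned long load = 1536;      /* sum of weighted_cpuload() */
        unsigned long nr_running = 3;

        unsigned long scaled_load = load * SCHED_POWER_SCALE / power;
        unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

        printf("load=%lu capacity=%lu has_capacity=%d\n",
               scaled_load, capacity, nr_running < capacity);
        return 0;
}
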
1019struct task_numa_env {
1020 struct task_struct *p;
1021
1022 int src_cpu, src_nid;
1023 int dst_cpu, dst_nid;
1024
1025 struct numa_stats src_stats, dst_stats;
1026
1027 int imbalance_pct, idx;
1028
1029 struct task_struct *best_task;
1030 long best_imp;
1031 int best_cpu;
1032};
1033
1034static void task_numa_assign(struct task_numa_env *env,
1035 struct task_struct *p, long imp)
1036{
1037 if (env->best_task)
1038 put_task_struct(env->best_task);
1039 if (p)
1040 get_task_struct(p);
1041
1042 env->best_task = p;
1043 env->best_imp = imp;
1044 env->best_cpu = env->dst_cpu;
1045}
1046
1047/*
1048 * This checks if the overall compute and NUMA accesses of the system would
1049 * be improved if the source task was migrated to the target dst_cpu, taking
1050 * into account that it might be best if the task running on the dst_cpu
1051 * were exchanged with the source task.
1052 */
1053static void task_numa_compare(struct task_numa_env *env,
1054 long taskimp, long groupimp)
1055{
1056 struct rq *src_rq = cpu_rq(env->src_cpu);
1057 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1058 struct task_struct *cur;
1059 long dst_load, src_load;
1060 long load;
1061 long imp = (groupimp > 0) ? groupimp : taskimp;
1062
1063 rcu_read_lock();
1064 cur = ACCESS_ONCE(dst_rq->curr);
1065 if (cur->pid == 0) /* idle */
1066 cur = NULL;
1067
1068 /*
1069 * "imp" is the fault differential for the source task between the
1070 * source and destination node. Calculate the total differential for
1071 * the source task and potential destination task. The more negative
1072 * the value is, the more remote accesses that would be expected to
1073 * be incurred if the tasks were swapped.
1074 */
1075 if (cur) {
1076 /* Skip this swap candidate if cannot move to the source cpu */
1077 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1078 goto unlock;
1079
1080 /*
1081 * If dst and source tasks are in the same NUMA group, or not
1082 * in any group then look only at task weights.
1083 */
1084 if (cur->numa_group == env->p->numa_group) {
1085 imp = taskimp + task_weight(cur, env->src_nid) -
1086 task_weight(cur, env->dst_nid);
1087 /*
1088 * Add some hysteresis to prevent swapping the
1089 * tasks within a group over tiny differences.
1090 */
1091 if (cur->numa_group)
1092 imp -= imp/16;
1093 } else {
1094 /*
1095 * Compare the group weights. If a task is all by
1096 * itself (not part of a group), use the task weight
1097 * instead.
1098 */
1099 if (env->p->numa_group)
1100 imp = groupimp;
1101 else
1102 imp = taskimp;
1103
1104 if (cur->numa_group)
1105 imp += group_weight(cur, env->src_nid) -
1106 group_weight(cur, env->dst_nid);
1107 else
1108 imp += task_weight(cur, env->src_nid) -
1109 task_weight(cur, env->dst_nid);
1110 }
1111 }
1112
1113 if (imp < env->best_imp)
1114 goto unlock;
1115
1116 if (!cur) {
1117 /* Is there capacity at our destination? */
1118 if (env->src_stats.has_capacity &&
1119 !env->dst_stats.has_capacity)
1120 goto unlock;
1121
1122 goto balance;
1123 }
1124
1125 /* Balance doesn't matter much if we're running a task per cpu */
1126 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1127 goto assign;
1128
1129 /*
1130 * In the overloaded case, try to keep the load balanced.
1131 */
1132balance:
1133 dst_load = env->dst_stats.load;
1134 src_load = env->src_stats.load;
1135
1136 /* XXX missing power terms */
1137 load = task_h_load(env->p);
1138 dst_load += load;
1139 src_load -= load;
1140
1141 if (cur) {
1142 load = task_h_load(cur);
1143 dst_load -= load;
1144 src_load += load;
1145 }
1146
1147 /* make src_load the smaller */
1148 if (dst_load < src_load)
1149 swap(dst_load, src_load);
1150
1151 if (src_load * env->imbalance_pct < dst_load * 100)
1152 goto unlock;
1153
1154assign:
1155 task_numa_assign(env, cur, imp);
1156unlock:
1157 rcu_read_unlock();
1158}
1159
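The final load check above can be illustrated in isolation. A small userspace model, using 112 for imbalance_pct (the initializer value shown below in task_numa_migrate(), purely for illustration) and made-up loads:

#include <stdio.h>
#include <stdbool.h>

/*
 * After the hypothetical move/swap, the larger load may exceed the
 * smaller one by at most imbalance_pct percent.
 */
static bool load_acceptable(long src_load, long dst_load, int imbalance_pct)
{
        if (dst_load < src_load) {      /* make src_load the smaller */
                long tmp = src_load;
                src_load = dst_load;
                dst_load = tmp;
        }
        return src_load * imbalance_pct >= dst_load * 100;
}

int main(void)
{
        printf("%d\n", load_acceptable(1000, 1120, 112));   /* 1: within 12% */
        printf("%d\n", load_acceptable(1000, 1200, 112));   /* 0: too skewed */
        return 0;
}
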
1160static void task_numa_find_cpu(struct task_numa_env *env,
1161 long taskimp, long groupimp)
1162{
1163 int cpu;
1164
1165 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1166 /* Skip this CPU if the source task cannot migrate */
1167 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1168 continue;
1169
1170 env->dst_cpu = cpu;
1171 task_numa_compare(env, taskimp, groupimp);
1172 }
1173}
1174
1175static int task_numa_migrate(struct task_struct *p)
1176{
1177 struct task_numa_env env = {
1178 .p = p,
1179
1180 .src_cpu = task_cpu(p),
1181 .src_nid = task_node(p),
1182
1183 .imbalance_pct = 112,
1184
1185 .best_task = NULL,
1186 .best_imp = 0,
1187 .best_cpu = -1
1188 };
1189 struct sched_domain *sd;
1190 unsigned long taskweight, groupweight;
1191 int nid, ret;
1192 long taskimp, groupimp;
1193
1194 /*
1195 * Pick the lowest SD_NUMA domain, as that would have the smallest
1196 * imbalance and would be the first to start moving tasks about.
1197 *
1198 * And we want to avoid any moving of tasks about, as that would create
1199 * random movement of tasks -- counter to the numa conditions we're trying
1200 * to satisfy here.
1201 */
1202 rcu_read_lock();
1203 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1204 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1205 rcu_read_unlock();
1206
1207 taskweight = task_weight(p, env.src_nid);
1208 groupweight = group_weight(p, env.src_nid);
1209 update_numa_stats(&env.src_stats, env.src_nid);
1210 env.dst_nid = p->numa_preferred_nid;
1211 taskimp = task_weight(p, env.dst_nid) - taskweight;
1212 groupimp = group_weight(p, env.dst_nid) - groupweight;
1213 update_numa_stats(&env.dst_stats, env.dst_nid);
1214
1215 /* If the preferred nid has capacity, try to use it. */
1216 if (env.dst_stats.has_capacity)
1217 task_numa_find_cpu(&env, taskimp, groupimp);
1218
1219 /* No space available on the preferred nid. Look elsewhere. */
1220 if (env.best_cpu == -1) {
1221 for_each_online_node(nid) {
1222 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1223 continue;
1224
1225 /* Only consider nodes where both the task and its group benefit */
1226 taskimp = task_weight(p, nid) - taskweight;
1227 groupimp = group_weight(p, nid) - groupweight;
1228 if (taskimp < 0 && groupimp < 0)
1229 continue;
1230
1231 env.dst_nid = nid;
1232 update_numa_stats(&env.dst_stats, env.dst_nid);
1233 task_numa_find_cpu(&env, taskimp, groupimp);
1234 }
1235 }
1236
1237 /* No better CPU than the current one was found. */
1238 if (env.best_cpu == -1)
1239 return -EAGAIN;
1240
1241 sched_setnuma(p, env.dst_nid);
1242
1243 /*
1244 * Reset the scan period if the task is being rescheduled on an
1245 * alternative node to recheck if the task is now properly placed.
1246 */
1247 p->numa_scan_period = task_scan_min(p);
1248
1249 if (env.best_task == NULL) {
1250 int ret = migrate_task_to(p, env.best_cpu);
1251 return ret;
1252 }
1253
1254 ret = migrate_swap(p, env.best_task);
1255 put_task_struct(env.best_task);
1256 return ret;
1257}
1258
1259/* Attempt to migrate a task to a CPU on the preferred node. */
1260static void numa_migrate_preferred(struct task_struct *p)
1261{
1262 /* This task has no NUMA fault statistics yet */
1263 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1264 return;
1265
1266 /* Periodically retry migrating the task to the preferred node */
1267 p->numa_migrate_retry = jiffies + HZ;
1268
1269 /* Success if task is already running on preferred CPU */
1270 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
1271 return;
1272
1273 /* Otherwise, try migrate to a CPU on the preferred node */
1274 task_numa_migrate(p);
1275}
1276
1277/*
1278 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1279 * increments. The more local the fault statistics are, the higher the scan
1280 * period will be for the next scan window. If the local/remote ratio is below
1281 * NUMA_PERIOD_THRESHOLD (where the range of the ratio is 1..NUMA_PERIOD_SLOTS),
1282 * the scan period will decrease.
1283 */
1284#define NUMA_PERIOD_SLOTS 10
1285#define NUMA_PERIOD_THRESHOLD 3
1286
1287/*
1288 * Increase the scan period (slow down scanning) if the majority of
1289 * our memory is already on our local node, or if the majority of
1290 * the page accesses are shared with other processes.
1291 * Otherwise, decrease the scan period.
1292 */
1293static void update_task_scan_period(struct task_struct *p,
1294 unsigned long shared, unsigned long private)
1295{
1296 unsigned int period_slot;
1297 int ratio;
1298 int diff;
1299
1300 unsigned long remote = p->numa_faults_locality[0];
1301 unsigned long local = p->numa_faults_locality[1];
1302
1303 /*
1304 * If there were no recorded hinting faults then either the task is
1305 * completely idle or all activity is in areas that are not of interest
1306 * to automatic numa balancing. Scan slower.
1307 */
1308 if (local + shared == 0) {
1309 p->numa_scan_period = min(p->numa_scan_period_max,
1310 p->numa_scan_period << 1);
1311
1312 p->mm->numa_next_scan = jiffies +
1313 msecs_to_jiffies(p->numa_scan_period);
836 1314
837 if (!p->mm) /* for example, ksmd faulting in a user's mm */
838 return; 1315 return;
1316 }
1317
1318 /*
1319 * Prepare to scale scan period relative to the current period.
1320 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1321 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1322 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1323 */
1324 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1325 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1326 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1327 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1328 if (!slot)
1329 slot = 1;
1330 diff = slot * period_slot;
1331 } else {
1332 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1333
1334 /*
1335 * Scale scan rate increases based on sharing. There is an
1336 * inverse relationship between the degree of sharing and
1337 * the adjustment made to the scanning period. Broadly
1338 * speaking, the intent is that there is little point
1339 * scanning faster if shared accesses dominate, as it may
1340 * simply bounce migrations uselessly.
1341 */
1342 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1343 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1344 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1345 }
1346
1347 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1348 task_scan_min(p), task_scan_max(p));
1349 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1350}
1351
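A simplified standalone model of the adjustment above; it drops the intermediate period_slot recomputation and uses invented fault counters and clamp bounds:

#include <stdio.h>

#define NUMA_PERIOD_SLOTS      10
#define NUMA_PERIOD_THRESHOLD  3
#define DIV_ROUND_UP(n, d)     (((n) + (d) - 1) / (d))

static int clamp(int val, int lo, int hi)
{
        return val < lo ? lo : val > hi ? hi : val;
}

int main(void)
{
        int period = 1000;                      /* current scan period (ms)   */
        long local = 80, remote = 20;           /* locality fault counters    */
        long priv = 70, shared = 30;            /* private vs. shared faults  */
        int period_slot, ratio, diff;

        period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
        ratio = local * NUMA_PERIOD_SLOTS / (local + remote);

        if (ratio >= NUMA_PERIOD_THRESHOLD) {
                /* mostly local: back off, at least one slot */
                int slot = ratio - NUMA_PERIOD_THRESHOLD;
                diff = (slot ? slot : 1) * period_slot;
        } else {
                /* mostly remote: speed up, scaled down by the shared share */
                diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
                ratio = DIV_ROUND_UP(priv * NUMA_PERIOD_SLOTS, priv + shared);
                diff = diff * ratio / NUMA_PERIOD_SLOTS;
        }

        printf("scan period: %d -> %d ms\n",
               period, clamp(period + diff, 200, 60000));
        return 0;
}
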
1352static void task_numa_placement(struct task_struct *p)
1353{
1354 int seq, nid, max_nid = -1, max_group_nid = -1;
1355 unsigned long max_faults = 0, max_group_faults = 0;
1356 unsigned long fault_types[2] = { 0, 0 };
1357 spinlock_t *group_lock = NULL;
1358
839 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1359 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 if (p->numa_scan_seq == seq) 1360 if (p->numa_scan_seq == seq)
841 return; 1361 return;
842 p->numa_scan_seq = seq; 1362 p->numa_scan_seq = seq;
1363 p->numa_scan_period_max = task_scan_max(p);
1364
1365 /* If the task is part of a group prevent parallel updates to group stats */
1366 if (p->numa_group) {
1367 group_lock = &p->numa_group->lock;
1368 spin_lock(group_lock);
1369 }
1370
1371 /* Find the node with the highest number of faults */
1372 for_each_online_node(nid) {
1373 unsigned long faults = 0, group_faults = 0;
1374 int priv, i;
1375
1376 for (priv = 0; priv < 2; priv++) {
1377 long diff;
1378
1379 i = task_faults_idx(nid, priv);
1380 diff = -p->numa_faults[i];
1381
1382 /* Decay existing window, copy faults since last scan */
1383 p->numa_faults[i] >>= 1;
1384 p->numa_faults[i] += p->numa_faults_buffer[i];
1385 fault_types[priv] += p->numa_faults_buffer[i];
1386 p->numa_faults_buffer[i] = 0;
1387
1388 faults += p->numa_faults[i];
1389 diff += p->numa_faults[i];
1390 p->total_numa_faults += diff;
1391 if (p->numa_group) {
1392 /* safe because we can only change our own group */
1393 p->numa_group->faults[i] += diff;
1394 p->numa_group->total_faults += diff;
1395 group_faults += p->numa_group->faults[i];
1396 }
1397 }
1398
1399 if (faults > max_faults) {
1400 max_faults = faults;
1401 max_nid = nid;
1402 }
1403
1404 if (group_faults > max_group_faults) {
1405 max_group_faults = group_faults;
1406 max_group_nid = nid;
1407 }
1408 }
1409
1410 update_task_scan_period(p, fault_types[0], fault_types[1]);
1411
1412 if (p->numa_group) {
1413 /*
1414 * If the preferred task and group nids are different,
1415 * iterate over the nodes again to find the best place.
1416 */
1417 if (max_nid != max_group_nid) {
1418 unsigned long weight, max_weight = 0;
1419
1420 for_each_online_node(nid) {
1421 weight = task_weight(p, nid) + group_weight(p, nid);
1422 if (weight > max_weight) {
1423 max_weight = weight;
1424 max_nid = nid;
1425 }
1426 }
1427 }
1428
1429 spin_unlock(group_lock);
1430 }
843 1431
844 /* FIXME: Scheduling placement policy hints go here */ 1432 /* Preferred node as the node with the most faults */
1433 if (max_faults && max_nid != p->numa_preferred_nid) {
1434 /* Update the preferred nid and migrate task if possible */
1435 sched_setnuma(p, max_nid);
1436 numa_migrate_preferred(p);
1437 }
1438}
1439
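The decaying window used above (halve the history, then add the faults captured since the last scan) can be seen in a toy standalone program with made-up per-scan fault counts:

#include <stdio.h>

int main(void)
{
        unsigned long faults = 0;                       /* decayed history     */
        unsigned long buffer[] = { 40, 40, 40, 0, 0 };  /* new faults per scan */
        int scan;

        for (scan = 0; scan < 5; scan++) {
                faults >>= 1;              /* decay the existing window      */
                faults += buffer[scan];    /* fold in this scan's faults     */
                printf("after scan %d: %lu\n", scan, faults);
        }
        return 0;
}

The count converges toward roughly twice the steady per-scan rate and halves each scan once the faults stop, which is why a node only stays preferred while it keeps attracting accesses.
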
1440static inline int get_numa_group(struct numa_group *grp)
1441{
1442 return atomic_inc_not_zero(&grp->refcount);
1443}
1444
1445static inline void put_numa_group(struct numa_group *grp)
1446{
1447 if (atomic_dec_and_test(&grp->refcount))
1448 kfree_rcu(grp, rcu);
1449}
1450
1451static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1452 int *priv)
1453{
1454 struct numa_group *grp, *my_grp;
1455 struct task_struct *tsk;
1456 bool join = false;
1457 int cpu = cpupid_to_cpu(cpupid);
1458 int i;
1459
1460 if (unlikely(!p->numa_group)) {
1461 unsigned int size = sizeof(struct numa_group) +
1462 2*nr_node_ids*sizeof(unsigned long);
1463
1464 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1465 if (!grp)
1466 return;
1467
1468 atomic_set(&grp->refcount, 1);
1469 spin_lock_init(&grp->lock);
1470 INIT_LIST_HEAD(&grp->task_list);
1471 grp->gid = p->pid;
1472
1473 for (i = 0; i < 2*nr_node_ids; i++)
1474 grp->faults[i] = p->numa_faults[i];
1475
1476 grp->total_faults = p->total_numa_faults;
1477
1478 list_add(&p->numa_entry, &grp->task_list);
1479 grp->nr_tasks++;
1480 rcu_assign_pointer(p->numa_group, grp);
1481 }
1482
1483 rcu_read_lock();
1484 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1485
1486 if (!cpupid_match_pid(tsk, cpupid))
1487 goto no_join;
1488
1489 grp = rcu_dereference(tsk->numa_group);
1490 if (!grp)
1491 goto no_join;
1492
1493 my_grp = p->numa_group;
1494 if (grp == my_grp)
1495 goto no_join;
1496
1497 /*
1498 * Only join the other group if it's bigger; if we're the bigger group,
1499 * the other task will join us.
1500 */
1501 if (my_grp->nr_tasks > grp->nr_tasks)
1502 goto no_join;
1503
1504 /*
1505 * Tie-break on the grp address.
1506 */
1507 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1508 goto no_join;
1509
1510 /* Always join threads in the same process. */
1511 if (tsk->mm == current->mm)
1512 join = true;
1513
1514 /* Simple filter to avoid false positives due to PID collisions */
1515 if (flags & TNF_SHARED)
1516 join = true;
1517
1518 /* Update priv based on whether false sharing was detected */
1519 *priv = !join;
1520
1521 if (join && !get_numa_group(grp))
1522 goto no_join;
1523
1524 rcu_read_unlock();
1525
1526 if (!join)
1527 return;
1528
1529 double_lock(&my_grp->lock, &grp->lock);
1530
1531 for (i = 0; i < 2*nr_node_ids; i++) {
1532 my_grp->faults[i] -= p->numa_faults[i];
1533 grp->faults[i] += p->numa_faults[i];
1534 }
1535 my_grp->total_faults -= p->total_numa_faults;
1536 grp->total_faults += p->total_numa_faults;
1537
1538 list_move(&p->numa_entry, &grp->task_list);
1539 my_grp->nr_tasks--;
1540 grp->nr_tasks++;
1541
1542 spin_unlock(&my_grp->lock);
1543 spin_unlock(&grp->lock);
1544
1545 rcu_assign_pointer(p->numa_group, grp);
1546
1547 put_numa_group(my_grp);
1548 return;
1549
1550no_join:
1551 rcu_read_unlock();
1552 return;
1553}
1554
1555void task_numa_free(struct task_struct *p)
1556{
1557 struct numa_group *grp = p->numa_group;
1558 int i;
1559 void *numa_faults = p->numa_faults;
1560
1561 if (grp) {
1562 spin_lock(&grp->lock);
1563 for (i = 0; i < 2*nr_node_ids; i++)
1564 grp->faults[i] -= p->numa_faults[i];
1565 grp->total_faults -= p->total_numa_faults;
1566
1567 list_del(&p->numa_entry);
1568 grp->nr_tasks--;
1569 spin_unlock(&grp->lock);
1570 rcu_assign_pointer(p->numa_group, NULL);
1571 put_numa_group(grp);
1572 }
1573
1574 p->numa_faults = NULL;
1575 p->numa_faults_buffer = NULL;
1576 kfree(numa_faults);
845} 1577}
846 1578
847/* 1579/*
848 * Got a PROT_NONE fault for a page on @node. 1580 * Got a PROT_NONE fault for a page on @node.
849 */ 1581 */
850void task_numa_fault(int node, int pages, bool migrated) 1582void task_numa_fault(int last_cpupid, int node, int pages, int flags)
851{ 1583{
852 struct task_struct *p = current; 1584 struct task_struct *p = current;
1585 bool migrated = flags & TNF_MIGRATED;
1586 int priv;
853 1587
854 if (!numabalancing_enabled) 1588 if (!numabalancing_enabled)
855 return; 1589 return;
856 1590
857 /* FIXME: Allocate task-specific structure for placement policy here */ 1591 /* for example, ksmd faulting in a user's mm */
1592 if (!p->mm)
1593 return;
1594
1595 /* Do not worry about placement if exiting */
1596 if (p->state == TASK_DEAD)
1597 return;
1598
1599 /* Allocate buffer to track faults on a per-node basis */
1600 if (unlikely(!p->numa_faults)) {
1601 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1602
1603 /* numa_faults and numa_faults_buffer share the allocation */
1604 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1605 if (!p->numa_faults)
1606 return;
1607
1608 BUG_ON(p->numa_faults_buffer);
1609 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1610 p->total_numa_faults = 0;
1611 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1612 }
858 1613
859 /* 1614 /*
860 * If pages are properly placed (did not migrate) then scan slower. 1615 * First accesses are treated as private, otherwise consider accesses
861 * This is reset periodically in case of phase changes 1616 * to be private if the accessing pid has not changed
862 */ 1617 */
863 if (!migrated) 1618 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
864 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, 1619 priv = 1;
865 p->numa_scan_period + jiffies_to_msecs(10)); 1620 } else {
1621 priv = cpupid_match_pid(p, last_cpupid);
1622 if (!priv && !(flags & TNF_NO_GROUP))
1623 task_numa_group(p, last_cpupid, flags, &priv);
1624 }
866 1625
867 task_numa_placement(p); 1626 task_numa_placement(p);
1627
1628 /*
1629 * Retry migrating the task to its preferred node periodically, in
1630 * case it previously failed or the scheduler moved us.
1631 */
1632 if (time_after(jiffies, p->numa_migrate_retry))
1633 numa_migrate_preferred(p);
1634
1635 if (migrated)
1636 p->numa_pages_migrated += pages;
1637
1638 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1639 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
868} 1640}
869 1641
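A small standalone sketch of the per-node bookkeeping done at the end of task_numa_fault(); nr_node_ids and the fault counts here are invented:

#include <stdio.h>
#include <string.h>

#define NR_NODE_IDS 4

static unsigned long faults_buffer[2 * NR_NODE_IDS];

static int task_faults_idx(int nid, int priv)
{
        return 2 * nid + priv;   /* [shared, private] pair per node */
}

int main(void)
{
        memset(faults_buffer, 0, sizeof(faults_buffer));

        faults_buffer[task_faults_idx(1, 1)] += 8;   /* private fault, node 1 */
        faults_buffer[task_faults_idx(1, 0)] += 2;   /* shared fault, node 1  */

        printf("node 1: shared=%lu private=%lu\n",
               faults_buffer[task_faults_idx(1, 0)],
               faults_buffer[task_faults_idx(1, 1)]);
        return 0;
}
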
870static void reset_ptenuma_scan(struct task_struct *p) 1642static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work)
884 struct mm_struct *mm = p->mm; 1656 struct mm_struct *mm = p->mm;
885 struct vm_area_struct *vma; 1657 struct vm_area_struct *vma;
886 unsigned long start, end; 1658 unsigned long start, end;
1659 unsigned long nr_pte_updates = 0;
887 long pages; 1660 long pages;
888 1661
889 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 1662 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work)
900 if (p->flags & PF_EXITING) 1673 if (p->flags & PF_EXITING)
901 return; 1674 return;
902 1675
903 /* 1676 if (!mm->numa_next_scan) {
904 * We do not care about task placement until a task runs on a node 1677 mm->numa_next_scan = now +
905 * other than the first one used by the address space. This is 1678 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
906 * largely because migrations are driven by what CPU the task
907 * is running on. If it's never scheduled on another node, it'll
908 * not migrate so why bother trapping the fault.
909 */
910 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 mm->first_nid = numa_node_id();
912 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 /* Are we running on a new node yet? */
914 if (numa_node_id() == mm->first_nid &&
915 !sched_feat_numa(NUMA_FORCE))
916 return;
917
918 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 }
920
921 /*
922 * Reset the scan period if enough time has gone by. Objective is that
923 * scanning will be reduced if pages are properly placed. As tasks
924 * can enter different phases this needs to be re-examined. Lacking
925 * proper tracking of reference behaviour, this blunt hammer is used.
926 */
927 migrate = mm->numa_next_reset;
928 if (time_after(now, migrate)) {
929 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 xchg(&mm->numa_next_reset, next_scan);
932 } 1679 }
933 1680
934 /* 1681 /*
@@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work)
938 if (time_before(now, migrate)) 1685 if (time_before(now, migrate))
939 return; 1686 return;
940 1687
941 if (p->numa_scan_period == 0) 1688 if (p->numa_scan_period == 0) {
942 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1689 p->numa_scan_period_max = task_scan_max(p);
1690 p->numa_scan_period = task_scan_min(p);
1691 }
943 1692
944 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 1693 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 1694 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 return; 1695 return;
947 1696
948 /* 1697 /*
949 * Do not set pte_numa if the current running node is rate-limited. 1698 * Delay this task enough that another task of this mm will likely win
950 * This loses statistics on the fault but if we are unwilling to 1699 * the next time around.
951 * migrate to this node, it is less likely we can do useful work
952 */ 1700 */
953 if (migrate_ratelimited(numa_node_id())) 1701 p->node_stamp += 2 * TICK_NSEC;
954 return;
955 1702
956 start = mm->numa_scan_offset; 1703 start = mm->numa_scan_offset;
957 pages = sysctl_numa_balancing_scan_size; 1704 pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work)
967 vma = mm->mmap; 1714 vma = mm->mmap;
968 } 1715 }
969 for (; vma; vma = vma->vm_next) { 1716 for (; vma; vma = vma->vm_next) {
970 if (!vma_migratable(vma)) 1717 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
971 continue; 1718 continue;
972 1719
973 /* Skip small VMAs. They are not likely to be of relevance */ 1720 /*
974 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 1721 * Shared library pages mapped by multiple processes are not
1722 * migrated as it is expected they are cache replicated. Avoid
1723 * hinting faults in read-only file-backed mappings or the vdso
1724 * as migrating the pages will be of marginal benefit.
1725 */
1726 if (!vma->vm_mm ||
1727 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
975 continue; 1728 continue;
976 1729
977 do { 1730 do {
978 start = max(start, vma->vm_start); 1731 start = max(start, vma->vm_start);
979 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1732 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 end = min(end, vma->vm_end); 1733 end = min(end, vma->vm_end);
981 pages -= change_prot_numa(vma, start, end); 1734 nr_pte_updates += change_prot_numa(vma, start, end);
1735
1736 /*
1737 * Scan sysctl_numa_balancing_scan_size but ensure that
1738 * at least one PTE is updated so that unused virtual
1739 * address space is quickly skipped.
1740 */
1741 if (nr_pte_updates)
1742 pages -= (end - start) >> PAGE_SHIFT;
982 1743
983 start = end; 1744 start = end;
984 if (pages <= 0) 1745 if (pages <= 0)
@@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work)
988 1749
989out: 1750out:
990 /* 1751 /*
991 * It is possible to reach the end of the VMA list but the last few VMAs are 1752 * It is possible to reach the end of the VMA list but the last few
992 * not guaranteed to be vma_migratable. If they are not, we would find the 1753 * VMAs are not guaranteed to be vma_migratable. If they are not, we
993 * !migratable VMA on the next scan but not reset the scanner to the start 1754 * would find the !migratable VMA on the next scan but not reset the
994 * so check it now. 1755 * scanner to the start so check it now.
995 */ 1756 */
996 if (vma) 1757 if (vma)
997 mm->numa_scan_offset = start; 1758 mm->numa_scan_offset = start;
@@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1025 1786
1026 if (now - curr->node_stamp > period) { 1787 if (now - curr->node_stamp > period) {
1027 if (!curr->node_stamp) 1788 if (!curr->node_stamp)
1028 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1789 curr->numa_scan_period = task_scan_min(curr);
1029 curr->node_stamp = now; 1790 curr->node_stamp += period;
1030 1791
1031 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 1792 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 1793 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1038static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1799static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039{ 1800{
1040} 1801}
1802
1803static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1804{
1805}
1806
1807static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1808{
1809}
1041#endif /* CONFIG_NUMA_BALANCING */ 1810#endif /* CONFIG_NUMA_BALANCING */
1042 1811
1043static void 1812static void
@@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1047 if (!parent_entity(se)) 1816 if (!parent_entity(se))
1048 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1817 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049#ifdef CONFIG_SMP 1818#ifdef CONFIG_SMP
1050 if (entity_is_task(se)) 1819 if (entity_is_task(se)) {
1051 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1820 struct rq *rq = rq_of(cfs_rq);
1821
1822 account_numa_enqueue(rq, task_of(se));
1823 list_add(&se->group_node, &rq->cfs_tasks);
1824 }
1052#endif 1825#endif
1053 cfs_rq->nr_running++; 1826 cfs_rq->nr_running++;
1054} 1827}
@@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1059 update_load_sub(&cfs_rq->load, se->load.weight); 1832 update_load_sub(&cfs_rq->load, se->load.weight);
1060 if (!parent_entity(se)) 1833 if (!parent_entity(se))
1061 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1834 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 if (entity_is_task(se)) 1835 if (entity_is_task(se)) {
1836 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1063 list_del_init(&se->group_node); 1837 list_del_init(&se->group_node);
1838 }
1064 cfs_rq->nr_running--; 1839 cfs_rq->nr_running--;
1065} 1840}
1066 1841
@@ -2070,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void)
2070 return static_key_false(&__cfs_bandwidth_used); 2845 return static_key_false(&__cfs_bandwidth_used);
2071} 2846}
2072 2847
2073void account_cfs_bandwidth_used(int enabled, int was_enabled) 2848void cfs_bandwidth_usage_inc(void)
2074{ 2849{
2075 /* only need to count groups transitioning between enabled/!enabled */ 2850 static_key_slow_inc(&__cfs_bandwidth_used);
2076 if (enabled && !was_enabled) 2851}
2077 static_key_slow_inc(&__cfs_bandwidth_used); 2852
2078 else if (!enabled && was_enabled) 2853void cfs_bandwidth_usage_dec(void)
2079 static_key_slow_dec(&__cfs_bandwidth_used); 2854{
2855 static_key_slow_dec(&__cfs_bandwidth_used);
2080} 2856}
2081#else /* HAVE_JUMP_LABEL */ 2857#else /* HAVE_JUMP_LABEL */
2082static bool cfs_bandwidth_used(void) 2858static bool cfs_bandwidth_used(void)
@@ -2084,7 +2860,8 @@ static bool cfs_bandwidth_used(void)
2084 return true; 2860 return true;
2085} 2861}
2086 2862
2087void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 2863void cfs_bandwidth_usage_inc(void) {}
2864void cfs_bandwidth_usage_dec(void) {}
2088#endif /* HAVE_JUMP_LABEL */ 2865#endif /* HAVE_JUMP_LABEL */
2089 2866
2090/* 2867/*
@@ -2335,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2335 cfs_rq->throttled_clock = rq_clock(rq); 3112 cfs_rq->throttled_clock = rq_clock(rq);
2336 raw_spin_lock(&cfs_b->lock); 3113 raw_spin_lock(&cfs_b->lock);
2337 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3114 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3115 if (!cfs_b->timer_active)
3116 __start_cfs_bandwidth(cfs_b);
2338 raw_spin_unlock(&cfs_b->lock); 3117 raw_spin_unlock(&cfs_b->lock);
2339} 3118}
2340 3119
@@ -2448,6 +3227,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2448 if (idle) 3227 if (idle)
2449 goto out_unlock; 3228 goto out_unlock;
2450 3229
3230 /*
3231 * if we have relooped after returning idle once, we need to update our
3232 * status as actually running, so that other cpus doing
3233 * __start_cfs_bandwidth will stop trying to cancel us.
3234 */
3235 cfs_b->timer_active = 1;
3236
2451 __refill_cfs_bandwidth_runtime(cfs_b); 3237 __refill_cfs_bandwidth_runtime(cfs_b);
2452 3238
2453 if (!throttled) { 3239 if (!throttled) {
@@ -2508,7 +3294,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2508/* how long we wait to gather additional slack before distributing */ 3294/* how long we wait to gather additional slack before distributing */
2509static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 3295static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2510 3296
2511/* are we near the end of the current quota period? */ 3297/*
3298 * Are we near the end of the current quota period?
3299 *
3300 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3301 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3302 * migrate_hrtimers, base is never cleared, so we are fine.
3303 */
2512static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 3304static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2513{ 3305{
2514 struct hrtimer *refresh_timer = &cfs_b->period_timer; 3306 struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2584,10 +3376,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2584 u64 expires; 3376 u64 expires;
2585 3377
2586 /* confirm we're still not at a refresh boundary */ 3378 /* confirm we're still not at a refresh boundary */
2587 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) 3379 raw_spin_lock(&cfs_b->lock);
3380 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3381 raw_spin_unlock(&cfs_b->lock);
2588 return; 3382 return;
3383 }
2589 3384
2590 raw_spin_lock(&cfs_b->lock);
2591 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3385 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2592 runtime = cfs_b->runtime; 3386 runtime = cfs_b->runtime;
2593 cfs_b->runtime = 0; 3387 cfs_b->runtime = 0;
@@ -2708,11 +3502,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2708 * (timer_active==0 becomes visible before the hrtimer call-back 3502 * (timer_active==0 becomes visible before the hrtimer call-back
2709 * terminates). In either case we ensure that it's re-programmed 3503 * terminates). In either case we ensure that it's re-programmed
2710 */ 3504 */
2711 while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 3505 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3506 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3507 /* bounce the lock to allow do_sched_cfs_period_timer to run */
2712 raw_spin_unlock(&cfs_b->lock); 3508 raw_spin_unlock(&cfs_b->lock);
2713 /* ensure cfs_b->lock is available while we wait */ 3509 cpu_relax();
2714 hrtimer_cancel(&cfs_b->period_timer);
2715
2716 raw_spin_lock(&cfs_b->lock); 3510 raw_spin_lock(&cfs_b->lock);
2717 /* if someone else restarted the timer then we're done */ 3511 /* if someone else restarted the timer then we're done */
2718 if (cfs_b->timer_active) 3512 if (cfs_b->timer_active)
@@ -3113,7 +3907,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3113{ 3907{
3114 struct sched_entity *se = tg->se[cpu]; 3908 struct sched_entity *se = tg->se[cpu];
3115 3909
3116 if (!tg->parent) /* the trivial, non-cgroup case */ 3910 if (!tg->parent || !wl) /* the trivial, non-cgroup case */
3117 return wl; 3911 return wl;
3118 3912
3119 for_each_sched_entity(se) { 3913 for_each_sched_entity(se) {
@@ -3166,8 +3960,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3166} 3960}
3167#else 3961#else
3168 3962
3169static inline unsigned long effective_load(struct task_group *tg, int cpu, 3963static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3170 unsigned long wl, unsigned long wg)
3171{ 3964{
3172 return wl; 3965 return wl;
3173} 3966}
@@ -3420,11 +4213,10 @@ done:
3420 * preempt must be disabled. 4213 * preempt must be disabled.
3421 */ 4214 */
3422static int 4215static int
3423select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 4216select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
3424{ 4217{
3425 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4218 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3426 int cpu = smp_processor_id(); 4219 int cpu = smp_processor_id();
3427 int prev_cpu = task_cpu(p);
3428 int new_cpu = cpu; 4220 int new_cpu = cpu;
3429 int want_affine = 0; 4221 int want_affine = 0;
3430 int sync = wake_flags & WF_SYNC; 4222 int sync = wake_flags & WF_SYNC;
@@ -3904,9 +4696,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3904 4696
3905static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4697static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3906 4698
4699enum fbq_type { regular, remote, all };
4700
3907#define LBF_ALL_PINNED 0x01 4701#define LBF_ALL_PINNED 0x01
3908#define LBF_NEED_BREAK 0x02 4702#define LBF_NEED_BREAK 0x02
3909#define LBF_SOME_PINNED 0x04 4703#define LBF_DST_PINNED 0x04
4704#define LBF_SOME_PINNED 0x08
3910 4705
3911struct lb_env { 4706struct lb_env {
3912 struct sched_domain *sd; 4707 struct sched_domain *sd;
@@ -3929,6 +4724,8 @@ struct lb_env {
3929 unsigned int loop; 4724 unsigned int loop;
3930 unsigned int loop_break; 4725 unsigned int loop_break;
3931 unsigned int loop_max; 4726 unsigned int loop_max;
4727
4728 enum fbq_type fbq_type;
3932}; 4729};
3933 4730
3934/* 4731/*
@@ -3975,6 +4772,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3975 return delta < (s64)sysctl_sched_migration_cost; 4772 return delta < (s64)sysctl_sched_migration_cost;
3976} 4773}
3977 4774
4775#ifdef CONFIG_NUMA_BALANCING
4776/* Returns true if the destination node has incurred more faults */
4777static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4778{
4779 int src_nid, dst_nid;
4780
4781 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4782 !(env->sd->flags & SD_NUMA)) {
4783 return false;
4784 }
4785
4786 src_nid = cpu_to_node(env->src_cpu);
4787 dst_nid = cpu_to_node(env->dst_cpu);
4788
4789 if (src_nid == dst_nid)
4790 return false;
4791
4792 /* Always encourage migration to the preferred node. */
4793 if (dst_nid == p->numa_preferred_nid)
4794 return true;
4795
4796 /* If both task and group weight improve, this move is a winner. */
4797 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4798 group_weight(p, dst_nid) > group_weight(p, src_nid))
4799 return true;
4800
4801 return false;
4802}
4803
4804
4805static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4806{
4807 int src_nid, dst_nid;
4808
4809 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4810 return false;
4811
4812 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4813 return false;
4814
4815 src_nid = cpu_to_node(env->src_cpu);
4816 dst_nid = cpu_to_node(env->dst_cpu);
4817
4818 if (src_nid == dst_nid)
4819 return false;
4820
4821 /* Migrating away from the preferred node is always bad. */
4822 if (src_nid == p->numa_preferred_nid)
4823 return true;
4824
4825 /* If either task or group weight get worse, don't do it. */
4826 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4827 group_weight(p, dst_nid) < group_weight(p, src_nid))
4828 return true;
4829
4830 return false;
4831}
4832
4833#else
4834static inline bool migrate_improves_locality(struct task_struct *p,
4835 struct lb_env *env)
4836{
4837 return false;
4838}
4839
4840static inline bool migrate_degrades_locality(struct task_struct *p,
4841 struct lb_env *env)
4842{
4843 return false;
4844}
4845#endif
4846
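A decision-table style sketch (plain C, invented weights) of the two locality hints above; the weights stand in for the 1000-scaled values returned by task_weight()/group_weight():

#include <stdio.h>
#include <stdbool.h>

struct hint {
        int preferred_nid, src_nid, dst_nid;
        long tw_src, tw_dst;    /* task weights on src/dst node  */
        long gw_src, gw_dst;    /* group weights on src/dst node */
};

static bool improves(const struct hint *h)
{
        if (h->src_nid == h->dst_nid)
                return false;
        if (h->dst_nid == h->preferred_nid)     /* moving to preferred node */
                return true;
        return h->tw_dst > h->tw_src && h->gw_dst > h->gw_src;
}

static bool degrades(const struct hint *h)
{
        if (h->src_nid == h->dst_nid)
                return false;
        if (h->src_nid == h->preferred_nid)     /* leaving preferred node */
                return true;
        return h->tw_dst < h->tw_src || h->gw_dst < h->gw_src;
}

int main(void)
{
        struct hint h = { .preferred_nid = 1, .src_nid = 0, .dst_nid = 1,
                          .tw_src = 200, .tw_dst = 800,
                          .gw_src = 300, .gw_dst = 700 };

        printf("improves=%d degrades=%d\n", improves(&h), degrades(&h));
        return 0;
}
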
3978/* 4847/*
3979 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 4848 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3980 */ 4849 */
@@ -3997,6 +4866,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3997 4866
3998 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 4867 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3999 4868
4869 env->flags |= LBF_SOME_PINNED;
4870
4000 /* 4871 /*
4001 * Remember if this task can be migrated to any other cpu in 4872 * Remember if this task can be migrated to any other cpu in
4002 * our sched_group. We may want to revisit it if we couldn't 4873 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4876,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4005 * Also avoid computing new_dst_cpu if we have already computed 4876 * Also avoid computing new_dst_cpu if we have already computed
4006 * one in current iteration. 4877 * one in current iteration.
4007 */ 4878 */
4008 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 4879 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
4009 return 0; 4880 return 0;
4010 4881
4011 /* Prevent to re-select dst_cpu via env's cpus */ 4882 /* Prevent to re-select dst_cpu via env's cpus */
4012 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 4883 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4013 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 4884 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4014 env->flags |= LBF_SOME_PINNED; 4885 env->flags |= LBF_DST_PINNED;
4015 env->new_dst_cpu = cpu; 4886 env->new_dst_cpu = cpu;
4016 break; 4887 break;
4017 } 4888 }
@@ -4030,11 +4901,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4030 4901
4031 /* 4902 /*
4032 * Aggressive migration if: 4903 * Aggressive migration if:
4033 * 1) task is cache cold, or 4904 * 1) destination numa is preferred
4034 * 2) too many balance attempts have failed. 4905 * 2) task is cache cold, or
4906 * 3) too many balance attempts have failed.
4035 */ 4907 */
4036
4037 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 4908 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4909 if (!tsk_cache_hot)
4910 tsk_cache_hot = migrate_degrades_locality(p, env);
4911
4912 if (migrate_improves_locality(p, env)) {
4913#ifdef CONFIG_SCHEDSTATS
4914 if (tsk_cache_hot) {
4915 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4916 schedstat_inc(p, se.statistics.nr_forced_migrations);
4917 }
4918#endif
4919 return 1;
4920 }
4921
4038 if (!tsk_cache_hot || 4922 if (!tsk_cache_hot ||
4039 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 4923 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4040 4924
@@ -4077,8 +4961,6 @@ static int move_one_task(struct lb_env *env)
4077 return 0; 4961 return 0;
4078} 4962}
4079 4963
4080static unsigned long task_h_load(struct task_struct *p);
4081
4082static const unsigned int sched_nr_migrate_break = 32; 4964static const unsigned int sched_nr_migrate_break = 32;
4083 4965
4084/* 4966/*
@@ -4291,6 +5173,10 @@ struct sg_lb_stats {
4291 unsigned int group_weight; 5173 unsigned int group_weight;
4292 int group_imb; /* Is there an imbalance in the group ? */ 5174 int group_imb; /* Is there an imbalance in the group ? */
4293 int group_has_capacity; /* Is there extra capacity in the group? */ 5175 int group_has_capacity; /* Is there extra capacity in the group? */
5176#ifdef CONFIG_NUMA_BALANCING
5177 unsigned int nr_numa_running;
5178 unsigned int nr_preferred_running;
5179#endif
4294}; 5180};
4295 5181
4296/* 5182/*
@@ -4330,7 +5216,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4330/** 5216/**
4331 * get_sd_load_idx - Obtain the load index for a given sched domain. 5217 * get_sd_load_idx - Obtain the load index for a given sched domain.
4332 * @sd: The sched_domain whose load_idx is to be obtained. 5218 * @sd: The sched_domain whose load_idx is to be obtained.
4333 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 5219 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
4334 * 5220 *
4335 * Return: The load index. 5221 * Return: The load index.
4336 */ 5222 */
@@ -4447,7 +5333,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4447{ 5333{
4448 struct sched_domain *child = sd->child; 5334 struct sched_domain *child = sd->child;
4449 struct sched_group *group, *sdg = sd->groups; 5335 struct sched_group *group, *sdg = sd->groups;
4450 unsigned long power; 5336 unsigned long power, power_orig;
4451 unsigned long interval; 5337 unsigned long interval;
4452 5338
4453 interval = msecs_to_jiffies(sd->balance_interval); 5339 interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +5345,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4459 return; 5345 return;
4460 } 5346 }
4461 5347
4462 power = 0; 5348 power_orig = power = 0;
4463 5349
4464 if (child->flags & SD_OVERLAP) { 5350 if (child->flags & SD_OVERLAP) {
4465 /* 5351 /*
@@ -4467,8 +5353,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
4467 * span the current group. 5353 * span the current group.
4468 */ 5354 */
4469 5355
4470 for_each_cpu(cpu, sched_group_cpus(sdg)) 5356 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4471 power += power_of(cpu); 5357 struct sched_group *sg = cpu_rq(cpu)->sd->groups;
5358
5359 power_orig += sg->sgp->power_orig;
5360 power += sg->sgp->power;
5361 }
4472 } else { 5362 } else {
4473 /* 5363 /*
4474 * !SD_OVERLAP domains can assume that child groups 5364 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +5367,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
4477 5367
4478 group = child->groups; 5368 group = child->groups;
4479 do { 5369 do {
5370 power_orig += group->sgp->power_orig;
4480 power += group->sgp->power; 5371 power += group->sgp->power;
4481 group = group->next; 5372 group = group->next;
4482 } while (group != child->groups); 5373 } while (group != child->groups);
4483 } 5374 }
4484 5375
4485 sdg->sgp->power_orig = sdg->sgp->power = power; 5376 sdg->sgp->power_orig = power_orig;
5377 sdg->sgp->power = power;
4486} 5378}
4487 5379
4488/* 5380/*
@@ -4526,13 +5418,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4526 * cpu 3 and leave one of the cpus in the second group unused. 5418 * cpu 3 and leave one of the cpus in the second group unused.
4527 * 5419 *
4528 * The current solution to this issue is detecting the skew in the first group 5420 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are 5421 * by noticing the lower domain failed to reach balance and had difficulty
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see 5422 * moving tasks due to affinity constraints.
4531 * sg_imbalanced().
4532 * 5423 *
4533 * When this is so detected; this group becomes a candidate for busiest; see 5424 * When this is so detected; this group becomes a candidate for busiest; see
4534 * update_sd_pick_busiest(). And calculcate_imbalance() and 5425 * update_sd_pick_busiest(). And calculate_imbalance() and
4535 * find_busiest_group() avoid some of the usual balance conditional to allow it 5426 * find_busiest_group() avoid some of the usual balance conditions to allow it
4536 * to create an effective group imbalance. 5427 * to create an effective group imbalance.
4537 * 5428 *
4538 * This is a somewhat tricky proposition since the next run might not find the 5429 * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +5431,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4540 * subtle and fragile situation. 5431 * subtle and fragile situation.
4541 */ 5432 */
4542 5433
4543struct sg_imb_stats { 5434static inline int sg_imbalanced(struct sched_group *group)
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{ 5435{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL; 5436 return group->sgp->imbalance;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552} 5437}
4553 5438
4554static inline void 5439/*
4555update_sg_imb_stats(struct sg_imb_stats *sgi, 5440 * Compute the group capacity.
4556 unsigned long load, unsigned long nr_running) 5441 *
5442 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5443 * first dividing out the smt factor and computing the actual number of cores
5444 * and limit power unit capacity with that.
5445 */
5446static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
4557{ 5447{
4558 if (load > sgi->max_cpu_load) 5448 unsigned int capacity, smt, cpus;
4559 sgi->max_cpu_load = load; 5449 unsigned int power, power_orig;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562 5450
4563 if (nr_running > sgi->max_nr_running) 5451 power = group->sgp->power;
4564 sgi->max_nr_running = nr_running; 5452 power_orig = group->sgp->power_orig;
4565 if (sgi->min_nr_running > nr_running) 5453 cpus = group->group_weight;
4566 sgi->min_nr_running = nr_running;
4567}
4568 5454
4569static inline int 5455 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) 5456 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
4571{ 5457 capacity = cpus / smt; /* cores */
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584 5458
4585 return 0; 5459 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5460 if (!capacity)
5461 capacity = fix_small_capacity(env->sd, group);
5462
5463 return capacity;
4586} 5464}
4587 5465
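A standalone walk-through of sg_capacity() with illustrative numbers: four SMT threads on two cores, each thread reporting roughly 0.58 * SCHED_POWER_SCALE of original power, and no RT/IRQ pressure:

#include <stdio.h>

#define SCHED_POWER_SCALE       1024U
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        unsigned int cpus = 4;                  /* 2 cores x 2 SMT threads  */
        unsigned int power_orig = 4 * 589;      /* ~0.58 * 1024 per thread  */
        unsigned int power = power_orig;        /* no RT/IRQ pressure here  */

        /* smt := ceil(cpus / (power_orig / SCALE)), i.e. threads per core */
        unsigned int smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
        unsigned int capacity = cpus / smt;     /* number of cores */

        if (capacity > DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE))
                capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

        printf("smt=%u capacity=%u task(s)\n", smt, capacity);
        return 0;
}

Rounding the summed power directly would also give 2 here, but with, say, 4 * 589 = 2356 split over a single 4-thread core the naive rounding can report phantom cores; dividing out the SMT factor first avoids that.
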
4588/** 5466/**
@@ -4597,12 +5475,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4597 struct sched_group *group, int load_idx, 5475 struct sched_group *group, int load_idx,
4598 int local_group, struct sg_lb_stats *sgs) 5476 int local_group, struct sg_lb_stats *sgs)
4599{ 5477{
4600 struct sg_imb_stats sgi;
4601 unsigned long nr_running; 5478 unsigned long nr_running;
4602 unsigned long load; 5479 unsigned long load;
4603 int i; 5480 int i;
4604 5481
4605 init_sg_imb_stats(&sgi); 5482 memset(sgs, 0, sizeof(*sgs));
4606 5483
4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5484 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4608 struct rq *rq = cpu_rq(i); 5485 struct rq *rq = cpu_rq(i);
@@ -4610,24 +5487,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4610 nr_running = rq->nr_running; 5487 nr_running = rq->nr_running;
4611 5488
4612 /* Bias balancing toward cpus of our domain */ 5489 /* Bias balancing toward cpus of our domain */
4613 if (local_group) { 5490 if (local_group)
4614 load = target_load(i, load_idx); 5491 load = target_load(i, load_idx);
4615 } else { 5492 else
4616 load = source_load(i, load_idx); 5493 load = source_load(i, load_idx);
4617 update_sg_imb_stats(&sgi, load, nr_running);
4618 }
4619 5494
4620 sgs->group_load += load; 5495 sgs->group_load += load;
4621 sgs->sum_nr_running += nr_running; 5496 sgs->sum_nr_running += nr_running;
5497#ifdef CONFIG_NUMA_BALANCING
5498 sgs->nr_numa_running += rq->nr_numa_running;
5499 sgs->nr_preferred_running += rq->nr_preferred_running;
5500#endif
4622 sgs->sum_weighted_load += weighted_cpuload(i); 5501 sgs->sum_weighted_load += weighted_cpuload(i);
4623 if (idle_cpu(i)) 5502 if (idle_cpu(i))
4624 sgs->idle_cpus++; 5503 sgs->idle_cpus++;
4625 } 5504 }
4626 5505
4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4628 time_after_eq(jiffies, group->sgp->next_update)))
4629 update_group_power(env->sd, env->dst_cpu);
4630
4631 /* Adjust by relative CPU power of the group */ 5506 /* Adjust by relative CPU power of the group */
4632 sgs->group_power = group->sgp->power; 5507 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5508 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +5510,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4635 if (sgs->sum_nr_running) 5510 if (sgs->sum_nr_running)
4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5511 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637 5512
4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4639
4640 sgs->group_capacity =
4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4642
4643 if (!sgs->group_capacity)
4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4646 sgs->group_weight = group->group_weight; 5513 sgs->group_weight = group->group_weight;
4647 5514
5515 sgs->group_imb = sg_imbalanced(group);
5516 sgs->group_capacity = sg_capacity(env, group);
5517
4648 if (sgs->group_capacity > sgs->sum_nr_running) 5518 if (sgs->group_capacity > sgs->sum_nr_running)
4649 sgs->group_has_capacity = 1; 5519 sgs->group_has_capacity = 1;
4650} 5520}
@@ -4693,14 +5563,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4693 return false; 5563 return false;
4694} 5564}
4695 5565
5566#ifdef CONFIG_NUMA_BALANCING
5567static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5568{
5569 if (sgs->sum_nr_running > sgs->nr_numa_running)
5570 return regular;
5571 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5572 return remote;
5573 return all;
5574}
5575
5576static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5577{
5578 if (rq->nr_running > rq->nr_numa_running)
5579 return regular;
5580 if (rq->nr_running > rq->nr_preferred_running)
5581 return remote;
5582 return all;
5583}
5584#else
5585static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5586{
5587 return all;
5588}
5589
5590static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5591{
5592 return regular;
5593}
5594#endif /* CONFIG_NUMA_BALANCING */
5595
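A plain-C sketch of the three-way classification, with invented counter values; it mirrors fbq_classify_rq()/fbq_classify_group() above:

#include <stdio.h>

enum fbq_type { regular, remote, all };

static enum fbq_type classify(unsigned int nr_running,
                              unsigned int nr_numa_running,
                              unsigned int nr_preferred_running)
{
        if (nr_running > nr_numa_running)
                return regular;         /* has non-NUMA tasks to move          */
        if (nr_running > nr_preferred_running)
                return remote;          /* only NUMA tasks, some on wrong node */
        return all;                     /* everything is ideally placed        */
}

int main(void)
{
        printf("%d %d %d\n",
               classify(4, 2, 1),       /* regular */
               classify(3, 3, 1),       /* remote  */
               classify(2, 2, 2));      /* all     */
        return 0;
}
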
4696/** 5596/**
4697 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5597 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4698 * @env: The load balancing environment. 5598 * @env: The load balancing environment.
4699 * @balance: Should we balance.
4700 * @sds: variable to hold the statistics for this sched_domain. 5599 * @sds: variable to hold the statistics for this sched_domain.
4701 */ 5600 */
4702static inline void update_sd_lb_stats(struct lb_env *env, 5601static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
4703 struct sd_lb_stats *sds)
4704{ 5602{
4705 struct sched_domain *child = env->sd->child; 5603 struct sched_domain *child = env->sd->child;
4706 struct sched_group *sg = env->sd->groups; 5604 struct sched_group *sg = env->sd->groups;
@@ -4720,11 +5618,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4720 if (local_group) { 5618 if (local_group) {
4721 sds->local = sg; 5619 sds->local = sg;
4722 sgs = &sds->local_stat; 5620 sgs = &sds->local_stat;
5621
5622 if (env->idle != CPU_NEWLY_IDLE ||
5623 time_after_eq(jiffies, sg->sgp->next_update))
5624 update_group_power(env->sd, env->dst_cpu);
4723 } 5625 }
4724 5626
4725 memset(sgs, 0, sizeof(*sgs));
4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 5627 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4727 5628
5629 if (local_group)
5630 goto next_group;
5631
4728 /* 5632 /*
4729 * In case the child domain prefers tasks go to siblings 5633 * In case the child domain prefers tasks go to siblings
4730 * first, lower the sg capacity to one so that we'll try 5634 * first, lower the sg capacity to one so that we'll try
@@ -4735,21 +5639,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4735 * heaviest group when it is already under-utilized (possible 5639 * heaviest group when it is already under-utilized (possible
4736 * with a large weight task outweighs the tasks on the system). 5640 * with a large weight task outweighs the tasks on the system).
4737 */ 5641 */
4738 if (prefer_sibling && !local_group && 5642 if (prefer_sibling && sds->local &&
4739 sds->local && sds->local_stat.group_has_capacity) 5643 sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U); 5644 sgs->group_capacity = min(sgs->group_capacity, 1U);
4741 5645
4742 /* Now, start updating sd_lb_stats */ 5646 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
4743 sds->total_load += sgs->group_load;
4744 sds->total_pwr += sgs->group_power;
4745
4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4747 sds->busiest = sg; 5647 sds->busiest = sg;
4748 sds->busiest_stat = *sgs; 5648 sds->busiest_stat = *sgs;
4749 } 5649 }
4750 5650
5651next_group:
5652 /* Now, start updating sd_lb_stats */
5653 sds->total_load += sgs->group_load;
5654 sds->total_pwr += sgs->group_power;
5655
4751 sg = sg->next; 5656 sg = sg->next;
4752 } while (sg != env->sd->groups); 5657 } while (sg != env->sd->groups);
5658
5659 if (env->sd->flags & SD_NUMA)
5660 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4753} 5661}
4754 5662
4755/** 5663/**
@@ -5053,15 +5961,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5053 int i; 5961 int i;
5054 5962
5055 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5963 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5056 unsigned long power = power_of(i); 5964 unsigned long power, capacity, wl;
5057 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5965 enum fbq_type rt;
5058 SCHED_POWER_SCALE);
5059 unsigned long wl;
5060 5966
5967 rq = cpu_rq(i);
5968 rt = fbq_classify_rq(rq);
5969
5970 /*
5971 * We classify groups/runqueues into three groups:
5972 * - regular: there are !numa tasks
5973 * - remote: there are numa tasks that run on the 'wrong' node
5974 * - all: there is no distinction
5975 *
5976 * In order to avoid migrating ideally placed numa tasks,
5977 * ignore those when there are better options.
5978 *
5979 * If we ignore the actual busiest queue to migrate another
5980 * task, the next balance pass can still reduce the busiest
5981 * queue by moving tasks around inside the node.
5982 *
5983 * If we cannot move enough load due to this classification
5984 * the next pass will adjust the group classification and
5985 * allow migration of more tasks.
5986 *
5987 * Both cases only affect the total convergence complexity.
5988 */
5989 if (rt > env->fbq_type)
5990 continue;
5991
5992 power = power_of(i);
5993 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5061 if (!capacity) 5994 if (!capacity)
5062 capacity = fix_small_capacity(env->sd, group); 5995 capacity = fix_small_capacity(env->sd, group);
5063 5996
5064 rq = cpu_rq(i);
5065 wl = weighted_cpuload(i); 5997 wl = weighted_cpuload(i);
5066 5998
5067 /* 5999 /*
@@ -5164,6 +6096,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5164 int *continue_balancing) 6096 int *continue_balancing)
5165{ 6097{
5166 int ld_moved, cur_ld_moved, active_balance = 0; 6098 int ld_moved, cur_ld_moved, active_balance = 0;
6099 struct sched_domain *sd_parent = sd->parent;
5167 struct sched_group *group; 6100 struct sched_group *group;
5168 struct rq *busiest; 6101 struct rq *busiest;
5169 unsigned long flags; 6102 unsigned long flags;
@@ -5177,6 +6110,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5177 .idle = idle, 6110 .idle = idle,
5178 .loop_break = sched_nr_migrate_break, 6111 .loop_break = sched_nr_migrate_break,
5179 .cpus = cpus, 6112 .cpus = cpus,
6113 .fbq_type = all,
5180 }; 6114 };
5181 6115
5182 /* 6116 /*
@@ -5268,17 +6202,17 @@ more_balance:
5268 * moreover subsequent load balance cycles should correct the 6202 * moreover subsequent load balance cycles should correct the
5269 * excess load moved. 6203 * excess load moved.
5270 */ 6204 */
5271 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6205 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6206
6207 /* Prevent to re-select dst_cpu via env's cpus */
6208 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5272 6209
5273 env.dst_rq = cpu_rq(env.new_dst_cpu); 6210 env.dst_rq = cpu_rq(env.new_dst_cpu);
5274 env.dst_cpu = env.new_dst_cpu; 6211 env.dst_cpu = env.new_dst_cpu;
5275 env.flags &= ~LBF_SOME_PINNED; 6212 env.flags &= ~LBF_DST_PINNED;
5276 env.loop = 0; 6213 env.loop = 0;
5277 env.loop_break = sched_nr_migrate_break; 6214 env.loop_break = sched_nr_migrate_break;
5278 6215
5279 /* Prevent to re-select dst_cpu via env's cpus */
5280 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5281
5282 /* 6216 /*
5283 * Go back to "more_balance" rather than "redo" since we 6217 * Go back to "more_balance" rather than "redo" since we
5284 * need to continue with same src_cpu. 6218 * need to continue with same src_cpu.
@@ -5286,6 +6220,18 @@ more_balance:
5286 goto more_balance; 6220 goto more_balance;
5287 } 6221 }
5288 6222
6223 /*
6224 * We failed to reach balance because of affinity.
6225 */
6226 if (sd_parent) {
6227 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6228
6229 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6230 *group_imbalance = 1;
6231 } else if (*group_imbalance)
6232 *group_imbalance = 0;
6233 }
6234
5289 /* All tasks on this runqueue were pinned by CPU affinity */ 6235 /* All tasks on this runqueue were pinned by CPU affinity */
5290 if (unlikely(env.flags & LBF_ALL_PINNED)) { 6236 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5291 cpumask_clear_cpu(cpu_of(busiest), cpus); 6237 cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5393,6 +6339,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5393 struct sched_domain *sd; 6339 struct sched_domain *sd;
5394 int pulled_task = 0; 6340 int pulled_task = 0;
5395 unsigned long next_balance = jiffies + HZ; 6341 unsigned long next_balance = jiffies + HZ;
6342 u64 curr_cost = 0;
5396 6343
5397 this_rq->idle_stamp = rq_clock(this_rq); 6344 this_rq->idle_stamp = rq_clock(this_rq);
5398 6345
@@ -5409,15 +6356,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5409 for_each_domain(this_cpu, sd) { 6356 for_each_domain(this_cpu, sd) {
5410 unsigned long interval; 6357 unsigned long interval;
5411 int continue_balancing = 1; 6358 int continue_balancing = 1;
6359 u64 t0, domain_cost;
5412 6360
5413 if (!(sd->flags & SD_LOAD_BALANCE)) 6361 if (!(sd->flags & SD_LOAD_BALANCE))
5414 continue; 6362 continue;
5415 6363
6364 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6365 break;
6366
5416 if (sd->flags & SD_BALANCE_NEWIDLE) { 6367 if (sd->flags & SD_BALANCE_NEWIDLE) {
6368 t0 = sched_clock_cpu(this_cpu);
6369
5417	/* If we've pulled tasks over, stop searching: */ 6370
5418 pulled_task = load_balance(this_cpu, this_rq, 6371 pulled_task = load_balance(this_cpu, this_rq,
5419 sd, CPU_NEWLY_IDLE, 6372 sd, CPU_NEWLY_IDLE,
5420 &continue_balancing); 6373 &continue_balancing);
6374
6375 domain_cost = sched_clock_cpu(this_cpu) - t0;
6376 if (domain_cost > sd->max_newidle_lb_cost)
6377 sd->max_newidle_lb_cost = domain_cost;
6378
6379 curr_cost += domain_cost;
5421 } 6380 }
5422 6381
5423 interval = msecs_to_jiffies(sd->balance_interval); 6382 interval = msecs_to_jiffies(sd->balance_interval);
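
The two hunks above make a newly idle CPU account for what each domain's balance attempt has cost in the past and give up once the expected idle time can no longer cover the next attempt. A small userspace sketch of that gating, with made-up costs and avg_idle values standing in for the nanosecond measurements taken via sched_clock_cpu():

#include <stdio.h>

struct dom { const char *name; unsigned long long max_newidle_lb_cost; };

int main(void)
{
    struct dom doms[] = {
        { "SMT",    3000 },    /* ns, hypothetical */
        { "MC",    25000 },
        { "NUMA", 400000 },
    };
    unsigned long long avg_idle = 120000;    /* expected idle time in ns, hypothetical */
    unsigned long long curr_cost = 0;

    for (int i = 0; i < 3; i++) {
        if (avg_idle < curr_cost + doms[i].max_newidle_lb_cost) {
            printf("stop before %s: %llu + %llu exceeds avg_idle %llu\n",
                   doms[i].name, curr_cost, doms[i].max_newidle_lb_cost, avg_idle);
            break;
        }
        /* pretend the balance attempt cost exactly its previous maximum */
        curr_cost += doms[i].max_newidle_lb_cost;
        printf("balanced %s, cumulative cost %llu\n", doms[i].name, curr_cost);
    }
    return 0;
}

Because curr_cost accumulates, a short expected idle period skips the expensive outer domains first, and the total is fed back into rq->max_idle_balance_cost further down.
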
@@ -5439,6 +6398,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5439 */ 6398 */
5440 this_rq->next_balance = next_balance; 6399 this_rq->next_balance = next_balance;
5441 } 6400 }
6401
6402 if (curr_cost > this_rq->max_idle_balance_cost)
6403 this_rq->max_idle_balance_cost = curr_cost;
5442} 6404}
5443 6405
5444/* 6406/*
@@ -5662,15 +6624,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5662 /* Earliest time when we have to do rebalance again */ 6624 /* Earliest time when we have to do rebalance again */
5663 unsigned long next_balance = jiffies + 60*HZ; 6625 unsigned long next_balance = jiffies + 60*HZ;
5664 int update_next_balance = 0; 6626 int update_next_balance = 0;
5665 int need_serialize; 6627 int need_serialize, need_decay = 0;
6628 u64 max_cost = 0;
5666 6629
5667 update_blocked_averages(cpu); 6630 update_blocked_averages(cpu);
5668 6631
5669 rcu_read_lock(); 6632 rcu_read_lock();
5670 for_each_domain(cpu, sd) { 6633 for_each_domain(cpu, sd) {
6634 /*
6635 * Decay the newidle max times here because this is a regular
6636 * visit to all the domains. Decay ~1% per second.
6637 */
6638 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6639 sd->max_newidle_lb_cost =
6640 (sd->max_newidle_lb_cost * 253) / 256;
6641 sd->next_decay_max_lb_cost = jiffies + HZ;
6642 need_decay = 1;
6643 }
6644 max_cost += sd->max_newidle_lb_cost;
6645
5671 if (!(sd->flags & SD_LOAD_BALANCE)) 6646 if (!(sd->flags & SD_LOAD_BALANCE))
5672 continue; 6647 continue;
5673 6648
6649 /*
6650 * Stop the load balance at this level. There is another
6651 * CPU in our sched group which is doing load balancing more
6652 * actively.
6653 */
6654 if (!continue_balancing) {
6655 if (need_decay)
6656 continue;
6657 break;
6658 }
6659
5674 interval = sd->balance_interval; 6660 interval = sd->balance_interval;
5675 if (idle != CPU_IDLE) 6661 if (idle != CPU_IDLE)
5676 interval *= sd->busy_factor; 6662 interval *= sd->busy_factor;
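
The 253/256 factor keeps about 98.8% of the recorded cost per visit, i.e. it removes roughly 1.2% each time the decay fires (once per HZ jiffies), which is what the "~1% per second" comment refers to. A tiny check of that arithmetic (the starting value is arbitrary):

#include <stdio.h>

int main(void)
{
    unsigned long long cost = 1000000;    /* arbitrary starting max_newidle_lb_cost */
    int sec;

    for (sec = 1; cost > 500000; sec++)
        cost = cost * 253 / 256;    /* same integer arithmetic as the patch */

    printf("decayed to half after about %d seconds\n", sec - 1);
    return 0;
}

Run standalone this reports a half-life of roughly a minute, so a one-off expensive newidle balance stops dominating the avg_idle comparison after a couple of minutes.
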
@@ -5689,7 +6675,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5689 if (time_after_eq(jiffies, sd->last_balance + interval)) { 6675 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5690 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { 6676 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5691 /* 6677 /*
5692 * The LBF_SOME_PINNED logic could have changed 6678 * The LBF_DST_PINNED logic could have changed
5693 * env->dst_cpu, so we can't know our idle 6679 * env->dst_cpu, so we can't know our idle
5694 * state even if we migrated tasks. Update it. 6680 * state even if we migrated tasks. Update it.
5695 */ 6681 */
@@ -5704,14 +6690,14 @@ out:
5704 next_balance = sd->last_balance + interval; 6690 next_balance = sd->last_balance + interval;
5705 update_next_balance = 1; 6691 update_next_balance = 1;
5706 } 6692 }
5707 6693 }
6694 if (need_decay) {
5708 /* 6695 /*
5709 * Stop the load balance at this level. There is another 6696 * Ensure the rq-wide value also decays but keep it at a
5710 * CPU in our sched group which is doing load balancing more 6697 * reasonable floor to avoid funnies with rq->avg_idle.
5711 * actively.
5712 */ 6698 */
5713 if (!continue_balancing) 6699 rq->max_idle_balance_cost =
5714 break; 6700 max((u64)sysctl_sched_migration_cost, max_cost);
5715 } 6701 }
5716 rcu_read_unlock(); 6702 rcu_read_unlock();
5717 6703
@@ -6214,7 +7200,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6214 se->cfs_rq = parent->my_q; 7200 se->cfs_rq = parent->my_q;
6215 7201
6216 se->my_q = cfs_rq; 7202 se->my_q = cfs_rq;
6217 update_load_set(&se->load, 0); 7203 /* guarantee group entities always have weight */
7204 update_load_set(&se->load, NICE_0_LOAD);
6218 se->parent = parent; 7205 se->parent = parent;
6219} 7206}
6220 7207
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8e4799..5716929a2e3a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
63/* 63/*
64 * Apply the automatic NUMA scheduling policy. Enabled automatically 64 * Apply the automatic NUMA scheduling policy. Enabled automatically
65 * at runtime if running on a NUMA machine. Can be controlled via 65 * at runtime if running on a NUMA machine. Can be controlled via
66 * numa_balancing=. Allow PTE scanning to be forced on UMA machines 66 * numa_balancing=
67 * for debugging the core machinery.
68 */ 67 */
69#ifdef CONFIG_NUMA_BALANCING 68#ifdef CONFIG_NUMA_BALANCING
70SCHED_FEAT(NUMA, false) 69SCHED_FEAT(NUMA, false)
71SCHED_FEAT(NUMA_FORCE, false) 70
71/*
72 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
73 * higher number of hinting faults are recorded during active load
74 * balancing.
75 */
76SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
77
78/*
79 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
80 * lower number of hinting faults have been recorded. As this has
81 * the potential to prevent a task ever migrating to a new node
 82 * due to CPU overload, it is disabled by default.
83 */
84SCHED_FEAT(NUMA_RESIST_LOWER, false)
72#endif 85#endif
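
On kernels built with CONFIG_SCHED_DEBUG these feature bits can be flipped at runtime through the sched_features debugfs file. A sketch, assuming debugfs is mounted at /sys/kernel/debug, root privileges, and the usual NO_<FEATURE> form to clear a bit:

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/sys/kernel/debug/sched_features", "w");

    if (!f) {
        perror("sched_features");
        return 1;
    }
    /* enable the feature; write "NO_NUMA_RESIST_LOWER" to disable it again */
    fputs("NUMA_RESIST_LOWER", f);
    fclose(f);
    return 0;
}
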
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..516c3d9ceea1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 12select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14	return task_cpu(p); /* IDLE tasks are never migrated */ 14	return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 01970c8e64df..7d57275fc396 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
246 * if we should look at the mask. It would be a shame 246 * if we should look at the mask. It would be a shame
247 * if we looked at the mask, but the mask was not 247 * if we looked at the mask, but the mask was not
248 * updated yet. 248 * updated yet.
249 *
250 * Matched by the barrier in pull_rt_task().
249 */ 251 */
250 wmb(); 252 smp_wmb();
251 atomic_inc(&rq->rd->rto_count); 253 atomic_inc(&rq->rd->rto_count);
252} 254}
253 255
@@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)
1169static int find_lowest_rq(struct task_struct *task); 1171static int find_lowest_rq(struct task_struct *task);
1170 1172
1171static int 1173static int
1172select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 1174select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1173{ 1175{
1174 struct task_struct *curr; 1176 struct task_struct *curr;
1175 struct rq *rq; 1177 struct rq *rq;
1176 int cpu;
1177
1178 cpu = task_cpu(p);
1179 1178
1180 if (p->nr_cpus_allowed == 1) 1179 if (p->nr_cpus_allowed == 1)
1181 goto out; 1180 goto out;
@@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1213 */ 1212 */
1214 if (curr && unlikely(rt_task(curr)) && 1213 if (curr && unlikely(rt_task(curr)) &&
1215 (curr->nr_cpus_allowed < 2 || 1214 (curr->nr_cpus_allowed < 2 ||
1216 curr->prio <= p->prio) && 1215 curr->prio <= p->prio)) {
1217 (p->nr_cpus_allowed > 1)) {
1218 int target = find_lowest_rq(p); 1216 int target = find_lowest_rq(p);
1219 1217
1220 if (target != -1) 1218 if (target != -1)
@@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)
1630 if (likely(!rt_overloaded(this_rq))) 1628 if (likely(!rt_overloaded(this_rq)))
1631 return 0; 1629 return 0;
1632 1630
1631 /*
1632	 * Match the barrier from rt_set_overload(); this guarantees that if we
1633 * see overloaded we must also see the rto_mask bit.
1634 */
1635 smp_rmb();
1636
1633 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1637 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1634 if (this_cpu == cpu) 1638 if (this_cpu == cpu)
1635 continue; 1639 continue;
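
The smp_wmb()/smp_rmb() pair orders "set the rto_mask bit, then bump rto_count" on the publishing side against "read rto_count, then scan rto_mask" on the pulling side. A userspace sketch of the same publication pattern using C11 atomics, where a release store and an acquire load stand in for the two barriers (compile with -pthread; the cpumask is simplified to a single int):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int overload_mask;           /* stands in for rd->rto_mask */
static atomic_int overload_count;   /* stands in for rd->rto_count */

static void *set_overload(void *arg)
{
    overload_mask |= 1 << 2;        /* "CPU 2 is overloaded" */
    /* publish the mask bit before the count becomes visible */
    atomic_fetch_add_explicit(&overload_count, 1, memory_order_release);
    return NULL;
}

static void *pull_side(void *arg)
{
    /* if we observe the count, the acquire guarantees we also see the bit */
    if (atomic_load_explicit(&overload_count, memory_order_acquire))
        printf("mask = %#x\n", overload_mask);
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, set_overload, NULL);
    pthread_create(&b, NULL, pull_side, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}
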
@@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1931 p->rt.time_slice = sched_rr_timeslice; 1935 p->rt.time_slice = sched_rr_timeslice;
1932 1936
1933 /* 1937 /*
1934 * Requeue to the end of queue if we (and all of our ancestors) are the 1938 * Requeue to the end of queue if we (and all of our ancestors) are not
1935 * only element on the queue 1939 * the only element on the queue
1936 */ 1940 */
1937 for_each_sched_rt_entity(rt_se) { 1941 for_each_sched_rt_entity(rt_se) {
1938 if (rt_se->run_list.prev != rt_se->run_list.next) { 1942 if (rt_se->run_list.prev != rt_se->run_list.next) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3c5653e1dca..4e650acffed7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h> 8#include <linux/tick.h>
9#include <linux/slab.h>
9 10
10#include "cpupri.h" 11#include "cpupri.h"
11#include "cpuacct.h" 12#include "cpuacct.h"
@@ -408,6 +409,10 @@ struct rq {
408 * remote CPUs use both these fields when doing load calculation. 409 * remote CPUs use both these fields when doing load calculation.
409 */ 410 */
410 unsigned int nr_running; 411 unsigned int nr_running;
412#ifdef CONFIG_NUMA_BALANCING
413 unsigned int nr_numa_running;
414 unsigned int nr_preferred_running;
415#endif
411 #define CPU_LOAD_IDX_MAX 5 416 #define CPU_LOAD_IDX_MAX 5
412 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 417 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
413 unsigned long last_load_update_tick; 418 unsigned long last_load_update_tick;
@@ -476,6 +481,9 @@ struct rq {
476 u64 age_stamp; 481 u64 age_stamp;
477 u64 idle_stamp; 482 u64 idle_stamp;
478 u64 avg_idle; 483 u64 avg_idle;
484
485 /* This is used to determine avg_idle's max value */
486 u64 max_idle_balance_cost;
479#endif 487#endif
480 488
481#ifdef CONFIG_IRQ_TIME_ACCOUNTING 489#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
552 return rq->clock_task; 560 return rq->clock_task;
553} 561}
554 562
563#ifdef CONFIG_NUMA_BALANCING
564extern void sched_setnuma(struct task_struct *p, int node);
565extern int migrate_task_to(struct task_struct *p, int cpu);
566extern int migrate_swap(struct task_struct *, struct task_struct *);
567#endif /* CONFIG_NUMA_BALANCING */
568
555#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
556 570
557#define rcu_dereference_check_sched_domain(p) \ 571#define rcu_dereference_check_sched_domain(p) \
@@ -593,9 +607,22 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
593 return hsd; 607 return hsd;
594} 608}
595 609
610static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
611{
612 struct sched_domain *sd;
613
614 for_each_domain(cpu, sd) {
615 if (sd->flags & flag)
616 break;
617 }
618
619 return sd;
620}
621
596DECLARE_PER_CPU(struct sched_domain *, sd_llc); 622DECLARE_PER_CPU(struct sched_domain *, sd_llc);
597DECLARE_PER_CPU(int, sd_llc_size); 623DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 624DECLARE_PER_CPU(int, sd_llc_id);
625DECLARE_PER_CPU(struct sched_domain *, sd_numa);
599 626
600struct sched_group_power { 627struct sched_group_power {
601 atomic_t ref; 628 atomic_t ref;
@@ -605,6 +632,7 @@ struct sched_group_power {
605 */ 632 */
606 unsigned int power, power_orig; 633 unsigned int power, power_orig;
607 unsigned long next_update; 634 unsigned long next_update;
635 int imbalance; /* XXX unrelated to power but shared group state */
608 /* 636 /*
609 * Number of busy cpus in this group. 637 * Number of busy cpus in this group.
610 */ 638 */
@@ -719,6 +747,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
719 */ 747 */
720 smp_wmb(); 748 smp_wmb();
721 task_thread_info(p)->cpu = cpu; 749 task_thread_info(p)->cpu = cpu;
750 p->wake_cpu = cpu;
722#endif 751#endif
723} 752}
724 753
@@ -974,7 +1003,7 @@ struct sched_class {
974 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1003 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
975 1004
976#ifdef CONFIG_SMP 1005#ifdef CONFIG_SMP
977 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1006 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
978 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1007 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
979 1008
980 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1009 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1220,6 +1249,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1220 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1249 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1221} 1250}
1222 1251
1252static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1253{
1254 if (l1 > l2)
1255 swap(l1, l2);
1256
1257 spin_lock(l1);
1258 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1259}
1260
1261static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1262{
1263 if (l1 > l2)
1264 swap(l1, l2);
1265
1266 raw_spin_lock(l1);
1267 raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1268}
1269
1223/* 1270/*
1224 * double_rq_lock - safely lock two runqueues 1271 * double_rq_lock - safely lock two runqueues
1225 * 1272 *
@@ -1305,7 +1352,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1305extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1352extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1306extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1353extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1307 1354
1308extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1355extern void cfs_bandwidth_usage_inc(void);
1356extern void cfs_bandwidth_usage_dec(void);
1309 1357
1310#ifdef CONFIG_NO_HZ_COMMON 1358#ifdef CONFIG_NO_HZ_COMMON
1311enum rq_nohz_flag_bits { 1359enum rq_nohz_flag_bits {
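
The double_lock()/double_raw_lock() helpers added above avoid ABBA deadlocks by always taking the lower-addressed lock first; SINGLE_DEPTH_NESTING is only a lockdep annotation and has no runtime effect. A userspace sketch of the ordering rule with pthread mutexes (compile with -pthread; the pointer comparison mirrors the kernel helper):

#include <pthread.h>
#include <stdio.h>

static void double_lock_demo(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
    if (l1 > l2) {          /* order by address, as the kernel helper does */
        pthread_mutex_t *tmp = l1;
        l1 = l2;
        l2 = tmp;
    }
    pthread_mutex_lock(l1);
    pthread_mutex_lock(l2);
}

int main(void)
{
    pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

    double_lock_demo(&a, &b);   /* the same order is taken ... */
    pthread_mutex_unlock(&b);
    pthread_mutex_unlock(&a);

    double_lock_demo(&b, &a);   /* ... even when a caller swaps the arguments */
    pthread_mutex_unlock(&b);
    pthread_mutex_unlock(&a);

    printf("both argument orders acquired without deadlock\n");
    return 0;
}
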
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c7edee71bce8..4ab704339656 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The 59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew. 60 * delta taken on each cpu would annul the skew.
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
63{ 63{
64 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 64 unsigned long long now = rq_clock(rq), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
69 sched_info_reset_dequeued(t); 69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta; 70 t->sched_info.run_delay += delta;
71 71
72 rq_sched_info_dequeued(task_rq(t), delta); 72 rq_sched_info_dequeued(rq, delta);
73} 73}
74 74
75/* 75/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
77 * long it was waiting to run. We also note when it began so that we 77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is. 78 * can keep stats on how long its timeslice is.
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct rq *rq, struct task_struct *t)
81{ 81{
82 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 82 unsigned long long now = rq_clock(rq), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
88 t->sched_info.last_arrival = now; 88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++; 89 t->sched_info.pcount++;
90 90
91 rq_sched_info_arrive(task_rq(t), delta); 91 rq_sched_info_arrive(rq, delta);
92} 92}
93 93
94/* 94/*
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)
96 * the timestamp if it is already not set. It's assumed that 96 * the timestamp if it is already not set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate. 97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */ 98 */
99static inline void sched_info_queued(struct task_struct *t) 99static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = rq_clock(task_rq(t)); 103 t->sched_info.last_queued = rq_clock(rq);
104} 104}
105 105
106/* 106/*
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)
111 * sched_info_queued() to mark that it has now again started waiting on 111 * sched_info_queued() to mark that it has now again started waiting on
112 * the runqueue. 112 * the runqueue.
113 */ 113 */
114static inline void sched_info_depart(struct task_struct *t) 114static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
115{ 115{
116 unsigned long long delta = rq_clock(task_rq(t)) - 116 unsigned long long delta = rq_clock(rq) -
117 t->sched_info.last_arrival; 117 t->sched_info.last_arrival;
118 118
119 rq_sched_info_depart(task_rq(t), delta); 119 rq_sched_info_depart(rq, delta);
120 120
121 if (t->state == TASK_RUNNING) 121 if (t->state == TASK_RUNNING)
122 sched_info_queued(t); 122 sched_info_queued(rq, t);
123} 123}
124 124
125/* 125/*
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
128 * the idle task.) We are only called when prev != next. 128 * the idle task.) We are only called when prev != next.
129 */ 129 */
130static inline void 130static inline void
131__sched_info_switch(struct task_struct *prev, struct task_struct *next) 131__sched_info_switch(struct rq *rq,
132 struct task_struct *prev, struct task_struct *next)
132{ 133{
133 struct rq *rq = task_rq(prev);
134
135 /* 134 /*
136 * prev now departs the cpu. It's not interesting to record 135 * prev now departs the cpu. It's not interesting to record
137 * stats about how efficient we were at scheduling the idle 136 * stats about how efficient we were at scheduling the idle
138 * process, however. 137 * process, however.
139 */ 138 */
140 if (prev != rq->idle) 139 if (prev != rq->idle)
141 sched_info_depart(prev); 140 sched_info_depart(rq, prev);
142 141
143 if (next != rq->idle) 142 if (next != rq->idle)
144 sched_info_arrive(next); 143 sched_info_arrive(rq, next);
145} 144}
146static inline void 145static inline void
147sched_info_switch(struct task_struct *prev, struct task_struct *next) 146sched_info_switch(struct rq *rq,
147 struct task_struct *prev, struct task_struct *next)
148{ 148{
149 if (unlikely(sched_info_on())) 149 if (unlikely(sched_info_on()))
150 __sched_info_switch(prev, next); 150 __sched_info_switch(rq, prev, next);
151} 151}
152#else 152#else
153#define sched_info_queued(t) do { } while (0) 153#define sched_info_queued(rq, t) do { } while (0)
154#define sched_info_reset_dequeued(t) do { } while (0) 154#define sched_info_reset_dequeued(t) do { } while (0)
155#define sched_info_dequeued(t) do { } while (0) 155#define sched_info_dequeued(rq, t) do { } while (0)
156#define sched_info_switch(t, next) do { } while (0) 156#define sched_info_depart(rq, t) do { } while (0)
157#define sched_info_arrive(rq, next) do { } while (0)
158#define sched_info_switch(rq, t, next) do { } while (0)
157#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 159#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
158 160
159/* 161/*
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index e08fbeeb54b9..47197de8abd9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
11 11
12#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
13static int 13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) 14select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
15{ 15{
16	return task_cpu(p); /* stop tasks never migrate */ 16	return task_cpu(p); /* stop tasks never migrate */
17} 17}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7d498d8cc4f..dcab1d3fb53d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -100,13 +100,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
100 100
101 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
102 /* 102 /*
103 * The preempt tracer hooks into add_preempt_count and will break 103 * The preempt tracer hooks into preempt_count_add and will break
104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
105 * is set and before current->softirq_enabled is cleared. 105 * is set and before current->softirq_enabled is cleared.
106 * We must manually increment preempt_count here and manually 106 * We must manually increment preempt_count here and manually
107 * call the trace_preempt_off later. 107 * call the trace_preempt_off later.
108 */ 108 */
109 preempt_count() += cnt; 109 __preempt_count_add(cnt);
110 /* 110 /*
111 * Were softirqs turned off above: 111 * Were softirqs turned off above:
112 */ 112 */
@@ -120,7 +120,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
120#else /* !CONFIG_TRACE_IRQFLAGS */ 120#else /* !CONFIG_TRACE_IRQFLAGS */
121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
122{ 122{
123 add_preempt_count(cnt); 123 preempt_count_add(cnt);
124 barrier(); 124 barrier();
125} 125}
126#endif /* CONFIG_TRACE_IRQFLAGS */ 126#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -139,7 +139,7 @@ static void __local_bh_enable(unsigned int cnt)
139 139
140 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
141 trace_softirqs_on(_RET_IP_); 141 trace_softirqs_on(_RET_IP_);
142 sub_preempt_count(cnt); 142 preempt_count_sub(cnt);
143} 143}
144 144
145/* 145/*
@@ -169,12 +169,12 @@ static inline void _local_bh_enable_ip(unsigned long ip)
169 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
170 * softirq processing: 170 * softirq processing:
171 */ 171 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 172 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
173 173
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
175 do_softirq(); 175 do_softirq();
176 176
177 dec_preempt_count(); 177 preempt_count_dec();
178#ifdef CONFIG_TRACE_IRQFLAGS 178#ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 179 local_irq_enable();
180#endif 180#endif
@@ -256,7 +256,7 @@ restart:
256 " exited with %08x?\n", vec_nr, 256 " exited with %08x?\n", vec_nr,
257 softirq_to_name[vec_nr], h->action, 257 softirq_to_name[vec_nr], h->action,
258 prev_count, preempt_count()); 258 prev_count, preempt_count());
259 preempt_count() = prev_count; 259 preempt_count_set(prev_count);
260 } 260 }
261 261
262 rcu_bh_qs(cpu); 262 rcu_bh_qs(cpu);
@@ -369,7 +369,7 @@ void irq_exit(void)
369 369
370 account_irq_exit_time(current); 370 account_irq_exit_time(current);
371 trace_hardirq_exit(); 371 trace_hardirq_exit();
372 sub_preempt_count(HARDIRQ_OFFSET); 372 preempt_count_sub(HARDIRQ_OFFSET);
373 if (!in_interrupt() && local_softirq_pending()) 373 if (!in_interrupt() && local_softirq_pending())
374 invoke_softirq(); 374 invoke_softirq();
375 375
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index c09f2955ae30..c530bc5be7cf 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -115,6 +115,182 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
115 return done.executed ? done.ret : -ENOENT; 115 return done.executed ? done.ret : -ENOENT;
116} 116}
117 117
118/* This controls the threads on each CPU. */
119enum multi_stop_state {
120 /* Dummy starting state for thread. */
121 MULTI_STOP_NONE,
122 /* Awaiting everyone to be scheduled. */
123 MULTI_STOP_PREPARE,
124 /* Disable interrupts. */
125 MULTI_STOP_DISABLE_IRQ,
126 /* Run the function */
127 MULTI_STOP_RUN,
128 /* Exit */
129 MULTI_STOP_EXIT,
130};
131
132struct multi_stop_data {
133 int (*fn)(void *);
134 void *data;
135 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
136 unsigned int num_threads;
137 const struct cpumask *active_cpus;
138
139 enum multi_stop_state state;
140 atomic_t thread_ack;
141};
142
143static void set_state(struct multi_stop_data *msdata,
144 enum multi_stop_state newstate)
145{
146 /* Reset ack counter. */
147 atomic_set(&msdata->thread_ack, msdata->num_threads);
148 smp_wmb();
149 msdata->state = newstate;
150}
151
152/* Last one to ack a state moves to the next state. */
153static void ack_state(struct multi_stop_data *msdata)
154{
155 if (atomic_dec_and_test(&msdata->thread_ack))
156 set_state(msdata, msdata->state + 1);
157}
158
159/* This is the cpu_stop function which stops the CPU. */
160static int multi_cpu_stop(void *data)
161{
162 struct multi_stop_data *msdata = data;
163 enum multi_stop_state curstate = MULTI_STOP_NONE;
164 int cpu = smp_processor_id(), err = 0;
165 unsigned long flags;
166 bool is_active;
167
168 /*
169 * When called from stop_machine_from_inactive_cpu(), irq might
170 * already be disabled. Save the state and restore it on exit.
171 */
172 local_save_flags(flags);
173
174 if (!msdata->active_cpus)
175 is_active = cpu == cpumask_first(cpu_online_mask);
176 else
177 is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
178
179 /* Simple state machine */
180 do {
181 /* Chill out and ensure we re-read multi_stop_state. */
182 cpu_relax();
183 if (msdata->state != curstate) {
184 curstate = msdata->state;
185 switch (curstate) {
186 case MULTI_STOP_DISABLE_IRQ:
187 local_irq_disable();
188 hard_irq_disable();
189 break;
190 case MULTI_STOP_RUN:
191 if (is_active)
192 err = msdata->fn(msdata->data);
193 break;
194 default:
195 break;
196 }
197 ack_state(msdata);
198 }
199 } while (curstate != MULTI_STOP_EXIT);
200
201 local_irq_restore(flags);
202 return err;
203}
204
205struct irq_cpu_stop_queue_work_info {
206 int cpu1;
207 int cpu2;
208 struct cpu_stop_work *work1;
209 struct cpu_stop_work *work2;
210};
211
212/*
213 * This function is always run with irqs and preemption disabled.
214 * This guarantees that both work1 and work2 get queued, before
215 * our local migrate thread gets the chance to preempt us.
216 */
217static void irq_cpu_stop_queue_work(void *arg)
218{
219 struct irq_cpu_stop_queue_work_info *info = arg;
220 cpu_stop_queue_work(info->cpu1, info->work1);
221 cpu_stop_queue_work(info->cpu2, info->work2);
222}
223
224/**
225 * stop_two_cpus - stops two cpus
226 * @cpu1: the cpu to stop
227 * @cpu2: the other cpu to stop
228 * @fn: function to execute
229 * @arg: argument to @fn
230 *
231 * Stops the two specified CPUs and runs @fn on one of them.
232 *
233 * returns when both are completed.
234 */
235int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
236{
237 struct cpu_stop_done done;
238 struct cpu_stop_work work1, work2;
239 struct irq_cpu_stop_queue_work_info call_args;
240 struct multi_stop_data msdata;
241
242 preempt_disable();
243 msdata = (struct multi_stop_data){
244 .fn = fn,
245 .data = arg,
246 .num_threads = 2,
247 .active_cpus = cpumask_of(cpu1),
248 };
249
250 work1 = work2 = (struct cpu_stop_work){
251 .fn = multi_cpu_stop,
252 .arg = &msdata,
253 .done = &done
254 };
255
256 call_args = (struct irq_cpu_stop_queue_work_info){
257 .cpu1 = cpu1,
258 .cpu2 = cpu2,
259 .work1 = &work1,
260 .work2 = &work2,
261 };
262
263 cpu_stop_init_done(&done, 2);
264 set_state(&msdata, MULTI_STOP_PREPARE);
265
266 /*
267 * If we observe both CPUs active we know _cpu_down() cannot yet have
268 * queued its stop_machine works and therefore ours will get executed
269 * first. Or its not either one of our CPUs that's getting unplugged,
270 * in which case we don't care.
271 *
272 * This relies on the stopper workqueues to be FIFO.
273 */
274 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
275 preempt_enable();
276 return -ENOENT;
277 }
278
279 /*
280 * Queuing needs to be done by the lowest numbered CPU, to ensure
281 * that works are always queued in the same order on every CPU.
282 * This prevents deadlocks.
283 */
284 smp_call_function_single(min(cpu1, cpu2),
285 &irq_cpu_stop_queue_work,
286 &call_args, 0);
287 preempt_enable();
288
289 wait_for_completion(&done.completion);
290
291 return done.executed ? done.ret : -ENOENT;
292}
293
118/** 294/**
119 * stop_one_cpu_nowait - stop a cpu but don't wait for completion 295 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
120 * @cpu: cpu to stop 296 * @cpu: cpu to stop
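
set_state()/ack_state() above form a simple stepping barrier: every stopper thread acknowledges the current state and the last one to acknowledge advances the whole group to the next state, with the smp_wmb() making sure the reset ack counter is visible before the new state. A userspace sketch of that stepping with two threads and C11 atomics (compile with -pthread; release/acquire models the barrier, and the spin loop stands in for the cpu_relax() loop):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

enum step { STEP_NONE, STEP_PREPARE, STEP_RUN, STEP_EXIT };

#define NUM_THREADS 2

static atomic_int state = STEP_NONE;
static atomic_int thread_ack;

static void set_state(int newstate)
{
    atomic_store(&thread_ack, NUM_THREADS);             /* reset ack counter */
    atomic_store_explicit(&state, newstate, memory_order_release);
}

static void ack_state(void)
{
    if (atomic_fetch_sub(&thread_ack, 1) == 1)          /* last one to ack moves on */
        set_state(atomic_load(&state) + 1);
}

static void *stopper(void *arg)
{
    int curstate = STEP_NONE;

    do {
        int s = atomic_load_explicit(&state, memory_order_acquire);
        if (s != curstate) {
            curstate = s;
            printf("thread %ld entered state %d\n", (long)arg, curstate);
            ack_state();
        }
    } while (curstate != STEP_EXIT);
    return NULL;
}

int main(void)
{
    pthread_t t[NUM_THREADS];

    set_state(STEP_PREPARE);
    for (long i = 0; i < NUM_THREADS; i++)
        pthread_create(&t[i], NULL, stopper, (void *)i);
    for (long i = 0; i < NUM_THREADS; i++)
        pthread_join(t[i], NULL);
    return 0;
}

Both threads step through PREPARE, RUN and EXIT in lockstep, which is the property multi_cpu_stop() relies on to disable interrupts everywhere before running @fn.
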
@@ -359,98 +535,14 @@ early_initcall(cpu_stop_init);
359 535
360#ifdef CONFIG_STOP_MACHINE 536#ifdef CONFIG_STOP_MACHINE
361 537
362/* This controls the threads on each CPU. */
363enum stopmachine_state {
364 /* Dummy starting state for thread. */
365 STOPMACHINE_NONE,
366 /* Awaiting everyone to be scheduled. */
367 STOPMACHINE_PREPARE,
368 /* Disable interrupts. */
369 STOPMACHINE_DISABLE_IRQ,
370 /* Run the function */
371 STOPMACHINE_RUN,
372 /* Exit */
373 STOPMACHINE_EXIT,
374};
375
376struct stop_machine_data {
377 int (*fn)(void *);
378 void *data;
379 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
380 unsigned int num_threads;
381 const struct cpumask *active_cpus;
382
383 enum stopmachine_state state;
384 atomic_t thread_ack;
385};
386
387static void set_state(struct stop_machine_data *smdata,
388 enum stopmachine_state newstate)
389{
390 /* Reset ack counter. */
391 atomic_set(&smdata->thread_ack, smdata->num_threads);
392 smp_wmb();
393 smdata->state = newstate;
394}
395
396/* Last one to ack a state moves to the next state. */
397static void ack_state(struct stop_machine_data *smdata)
398{
399 if (atomic_dec_and_test(&smdata->thread_ack))
400 set_state(smdata, smdata->state + 1);
401}
402
403/* This is the cpu_stop function which stops the CPU. */
404static int stop_machine_cpu_stop(void *data)
405{
406 struct stop_machine_data *smdata = data;
407 enum stopmachine_state curstate = STOPMACHINE_NONE;
408 int cpu = smp_processor_id(), err = 0;
409 unsigned long flags;
410 bool is_active;
411
412 /*
413 * When called from stop_machine_from_inactive_cpu(), irq might
414 * already be disabled. Save the state and restore it on exit.
415 */
416 local_save_flags(flags);
417
418 if (!smdata->active_cpus)
419 is_active = cpu == cpumask_first(cpu_online_mask);
420 else
421 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
422
423 /* Simple state machine */
424 do {
425 /* Chill out and ensure we re-read stopmachine_state. */
426 cpu_relax();
427 if (smdata->state != curstate) {
428 curstate = smdata->state;
429 switch (curstate) {
430 case STOPMACHINE_DISABLE_IRQ:
431 local_irq_disable();
432 hard_irq_disable();
433 break;
434 case STOPMACHINE_RUN:
435 if (is_active)
436 err = smdata->fn(smdata->data);
437 break;
438 default:
439 break;
440 }
441 ack_state(smdata);
442 }
443 } while (curstate != STOPMACHINE_EXIT);
444
445 local_irq_restore(flags);
446 return err;
447}
448
449int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 538int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
450{ 539{
451 struct stop_machine_data smdata = { .fn = fn, .data = data, 540 struct multi_stop_data msdata = {
452 .num_threads = num_online_cpus(), 541 .fn = fn,
453 .active_cpus = cpus }; 542 .data = data,
543 .num_threads = num_online_cpus(),
544 .active_cpus = cpus,
545 };
454 546
455 if (!stop_machine_initialized) { 547 if (!stop_machine_initialized) {
456 /* 548 /*
@@ -461,7 +553,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
461 unsigned long flags; 553 unsigned long flags;
462 int ret; 554 int ret;
463 555
464 WARN_ON_ONCE(smdata.num_threads != 1); 556 WARN_ON_ONCE(msdata.num_threads != 1);
465 557
466 local_irq_save(flags); 558 local_irq_save(flags);
467 hard_irq_disable(); 559 hard_irq_disable();
@@ -472,8 +564,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
472 } 564 }
473 565
474 /* Set the initial state and stop all online cpus. */ 566 /* Set the initial state and stop all online cpus. */
475 set_state(&smdata, STOPMACHINE_PREPARE); 567 set_state(&msdata, MULTI_STOP_PREPARE);
476 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 568 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
477} 569}
478 570
479int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 571int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +605,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
513int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 605int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
514 const struct cpumask *cpus) 606 const struct cpumask *cpus)
515{ 607{
516 struct stop_machine_data smdata = { .fn = fn, .data = data, 608 struct multi_stop_data msdata = { .fn = fn, .data = data,
517 .active_cpus = cpus }; 609 .active_cpus = cpus };
518 struct cpu_stop_done done; 610 struct cpu_stop_done done;
519 int ret; 611 int ret;
520 612
521 /* Local CPU must be inactive and CPU hotplug in progress. */ 613 /* Local CPU must be inactive and CPU hotplug in progress. */
522 BUG_ON(cpu_active(raw_smp_processor_id())); 614 BUG_ON(cpu_active(raw_smp_processor_id()));
523 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ 615 msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
524 616
525 /* No proper task established and can't sleep - busy wait for lock. */ 617 /* No proper task established and can't sleep - busy wait for lock. */
526 while (!mutex_trylock(&stop_cpus_mutex)) 618 while (!mutex_trylock(&stop_cpus_mutex))
527 cpu_relax(); 619 cpu_relax();
528 620
529 /* Schedule work on other CPUs and execute directly for local CPU */ 621 /* Schedule work on other CPUs and execute directly for local CPU */
530 set_state(&smdata, STOPMACHINE_PREPARE); 622 set_state(&msdata, MULTI_STOP_PREPARE);
531 cpu_stop_init_done(&done, num_active_cpus()); 623 cpu_stop_init_done(&done, num_active_cpus());
532 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, 624 queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
533 &done); 625 &done);
534 ret = stop_machine_cpu_stop(&smdata); 626 ret = multi_cpu_stop(&msdata);
535 627
536 /* Busy wait for completion. */ 628 /* Busy wait for completion. */
537 while (!completion_done(&done.completion)) 629 while (!completion_done(&done.completion))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2f06f3c6a3f..a159e1fd2013 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = {
371 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
372 }, 372 },
373 { 373 {
374 .procname = "numa_balancing_scan_period_reset",
375 .data = &sysctl_numa_balancing_scan_period_reset,
376 .maxlen = sizeof(unsigned int),
377 .mode = 0644,
378 .proc_handler = proc_dointvec,
379 },
380 {
381 .procname = "numa_balancing_scan_period_max_ms", 374 .procname = "numa_balancing_scan_period_max_ms",
382 .data = &sysctl_numa_balancing_scan_period_max, 375 .data = &sysctl_numa_balancing_scan_period_max,
383 .maxlen = sizeof(unsigned int), 376 .maxlen = sizeof(unsigned int),
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
391 .mode = 0644, 384 .mode = 0644,
392 .proc_handler = proc_dointvec, 385 .proc_handler = proc_dointvec,
393 }, 386 },
387 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int),
398 .mode = 0644,
399 .proc_handler = proc_dointvec,
400 },
394#endif /* CONFIG_NUMA_BALANCING */ 401#endif /* CONFIG_NUMA_BALANCING */
395#endif /* CONFIG_SCHED_DEBUG */ 402#endif /* CONFIG_SCHED_DEBUG */
396 { 403 {
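
The new entries surface as files under /proc/sys/kernel/. A sketch that reads one of them back; this assumes a kernel from this series built with CONFIG_NUMA_BALANCING and CONFIG_SCHED_DEBUG, otherwise the file simply does not exist:

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/sys/kernel/numa_balancing_migrate_deferred", "r");
    unsigned int val;

    if (!f) {
        perror("numa_balancing_migrate_deferred");
        return 1;
    }
    if (fscanf(f, "%u", &val) == 1)
        printf("numa_balancing_migrate_deferred = %u\n", val);
    fclose(f);
    return 0;
}

Writing works the same way (mode 0644, proc_dointvec), by opening the file for writing and printing an integer.
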
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13db3d1..6582b82fa966 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1093 unsigned long data) 1093 unsigned long data)
1094{ 1094{
1095 int preempt_count = preempt_count(); 1095 int count = preempt_count();
1096 1096
1097#ifdef CONFIG_LOCKDEP 1097#ifdef CONFIG_LOCKDEP
1098 /* 1098 /*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1119 1119
1120 lock_map_release(&lockdep_map); 1120 lock_map_release(&lockdep_map);
1121 1121
1122 if (preempt_count != preempt_count()) { 1122 if (count != preempt_count()) {
1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1124 fn, preempt_count, preempt_count()); 1124 fn, count, preempt_count());
1125 /* 1125 /*
1126 * Restore the preempt count. That gives us a decent 1126 * Restore the preempt count. That gives us a decent
1127 * chance to survive and extract information. If the 1127 * chance to survive and extract information. If the
1128 * callback kept a lock held, bad luck, but not worse 1128 * callback kept a lock held, bad luck, but not worse
1129 * than the BUG() we had. 1129 * than the BUG() we had.
1130 */ 1130 */
1131 preempt_count() = preempt_count; 1131 preempt_count_set(count);
1132 } 1132 }
1133} 1133}
1134 1134
diff --git a/kernel/wait.c b/kernel/wait.c
index d550920e040c..de21c6305a44 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,6 +92,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
96{
97 unsigned long flags;
98
99 if (signal_pending_state(state, current))
100 return -ERESTARTSYS;
101
102 wait->private = current;
103 wait->func = autoremove_wake_function;
104
105 spin_lock_irqsave(&q->lock, flags);
106 if (list_empty(&wait->task_list)) {
107 if (wait->flags & WQ_FLAG_EXCLUSIVE)
108 __add_wait_queue_tail(q, wait);
109 else
110 __add_wait_queue(q, wait);
111 }
112 set_current_state(state);
113 spin_unlock_irqrestore(&q->lock, flags);
114
115 return 0;
116}
117EXPORT_SYMBOL(prepare_to_wait_event);
118
95/** 119/**
96 * finish_wait - clean up after waiting in a queue 120 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 121 * @q: waitqueue waited on
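
prepare_to_wait_event() is the helper the reworked wait_event() macros in this series (see the include/linux/wait.h changes in the diffstat) are built around: it bails out with -ERESTARTSYS when a signal is already pending, otherwise it queues the waiter and sets the task state under the queue lock. A hedged in-kernel sketch of the loop a caller ends up with; my_wq, its initialisation and the condition callback are placeholders, and this is module-style kernel code, not a standalone program:

#include <linux/types.h>
#include <linux/wait.h>
#include <linux/sched.h>

static wait_queue_head_t my_wq;         /* assume init_waitqueue_head() was called */

static long wait_for_condition(bool (*condition)(void))
{
        DEFINE_WAIT(wait);
        long err = 0;

        for (;;) {
                long ret = prepare_to_wait_event(&my_wq, &wait, TASK_INTERRUPTIBLE);

                if (condition())
                        break;
                if (ret) {              /* -ERESTARTSYS: a signal is pending */
                        err = ret;
                        break;
                }
                schedule();             /* woken by a wake_up() on my_wq */
        }
        finish_wait(&my_wq, &wait);
        return err;
}

Because the signal check happens before touching the queue, an interrupted waiter never has to be removed again, which is what lets the macro side shrink.
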
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 6dc09d8f4c24..872a15a2a637 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -1002,7 +1002,7 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
1002 * Some tests (e.g. double-unlock) might corrupt the preemption 1002 * Some tests (e.g. double-unlock) might corrupt the preemption
1003 * count, so restore it: 1003 * count, so restore it:
1004 */ 1004 */
1005 preempt_count() = saved_preempt_count; 1005 preempt_count_set(saved_preempt_count);
1006#ifdef CONFIG_TRACE_IRQFLAGS 1006#ifdef CONFIG_TRACE_IRQFLAGS
1007 if (softirq_count()) 1007 if (softirq_count())
1008 current->softirqs_enabled = 0; 1008 current->softirqs_enabled = 0;
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 4c0d0e51d49e..04abe53f12a1 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -9,10 +9,9 @@
9 9
10notrace unsigned int debug_smp_processor_id(void) 10notrace unsigned int debug_smp_processor_id(void)
11{ 11{
12 unsigned long preempt_count = preempt_count();
13 int this_cpu = raw_smp_processor_id(); 12 int this_cpu = raw_smp_processor_id();
14 13
15 if (likely(preempt_count)) 14 if (likely(preempt_count()))
16 goto out; 15 goto out;
17 16
18 if (irqs_disabled()) 17 if (irqs_disabled())
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cca80d96e509..2612f60f53ee 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1282,19 +1282,32 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1282 struct page *page; 1282 struct page *page;
1283 unsigned long haddr = addr & HPAGE_PMD_MASK; 1283 unsigned long haddr = addr & HPAGE_PMD_MASK;
1284 int page_nid = -1, this_nid = numa_node_id(); 1284 int page_nid = -1, this_nid = numa_node_id();
1285 int target_nid; 1285 int target_nid, last_cpupid = -1;
1286 bool page_locked; 1286 bool page_locked;
1287 bool migrated = false; 1287 bool migrated = false;
1288 int flags = 0;
1288 1289
1289 spin_lock(&mm->page_table_lock); 1290 spin_lock(&mm->page_table_lock);
1290 if (unlikely(!pmd_same(pmd, *pmdp))) 1291 if (unlikely(!pmd_same(pmd, *pmdp)))
1291 goto out_unlock; 1292 goto out_unlock;
1292 1293
1293 page = pmd_page(pmd); 1294 page = pmd_page(pmd);
1295 BUG_ON(is_huge_zero_page(page));
1294 page_nid = page_to_nid(page); 1296 page_nid = page_to_nid(page);
1297 last_cpupid = page_cpupid_last(page);
1295 count_vm_numa_event(NUMA_HINT_FAULTS); 1298 count_vm_numa_event(NUMA_HINT_FAULTS);
1296 if (page_nid == this_nid) 1299 if (page_nid == this_nid) {
1297 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 1300 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1301 flags |= TNF_FAULT_LOCAL;
1302 }
1303
1304 /*
1305 * Avoid grouping on DSO/COW pages in specific and RO pages
1306 * in general, RO pages shouldn't hurt as much anyway since
1307 * they can be in shared cache state.
1308 */
1309 if (!pmd_write(pmd))
1310 flags |= TNF_NO_GROUP;
1298 1311
1299 /* 1312 /*
1300 * Acquire the page lock to serialise THP migrations but avoid dropping 1313 * Acquire the page lock to serialise THP migrations but avoid dropping
@@ -1325,7 +1338,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1325 lock_page(page); 1338 lock_page(page);
1326 anon_vma = page_lock_anon_vma_read(page); 1339 anon_vma = page_lock_anon_vma_read(page);
1327 1340
1328 /* Confirm the PTE did not while locked */ 1341 /* Confirm the PMD did not change while page_table_lock was released */
1329 spin_lock(&mm->page_table_lock); 1342 spin_lock(&mm->page_table_lock);
1330 if (unlikely(!pmd_same(pmd, *pmdp))) { 1343 if (unlikely(!pmd_same(pmd, *pmdp))) {
1331 unlock_page(page); 1344 unlock_page(page);
@@ -1341,8 +1354,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1341 spin_unlock(&mm->page_table_lock); 1354 spin_unlock(&mm->page_table_lock);
1342 migrated = migrate_misplaced_transhuge_page(mm, vma, 1355 migrated = migrate_misplaced_transhuge_page(mm, vma,
1343 pmdp, pmd, addr, page, target_nid); 1356 pmdp, pmd, addr, page, target_nid);
1344 if (migrated) 1357 if (migrated) {
1358 flags |= TNF_MIGRATED;
1345 page_nid = target_nid; 1359 page_nid = target_nid;
1360 }
1346 1361
1347 goto out; 1362 goto out;
1348clear_pmdnuma: 1363clear_pmdnuma:
@@ -1360,7 +1375,7 @@ out:
1360 page_unlock_anon_vma_read(anon_vma); 1375 page_unlock_anon_vma_read(anon_vma);
1361 1376
1362 if (page_nid != -1) 1377 if (page_nid != -1)
1363 task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); 1378 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
1364 1379
1365 return 0; 1380 return 0;
1366} 1381}
@@ -1458,6 +1473,12 @@ out:
1458 return ret; 1473 return ret;
1459} 1474}
1460 1475
1476/*
1477 * Returns
1478 * - 0 if PMD could not be locked
1478	 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
1479	 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
1481 */
1461int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1482int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1462 unsigned long addr, pgprot_t newprot, int prot_numa) 1483 unsigned long addr, pgprot_t newprot, int prot_numa)
1463{ 1484{
@@ -1466,22 +1487,34 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1466 1487
1467 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1488 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1468 pmd_t entry; 1489 pmd_t entry;
1469 entry = pmdp_get_and_clear(mm, addr, pmd); 1490 ret = 1;
1470 if (!prot_numa) { 1491 if (!prot_numa) {
1492 entry = pmdp_get_and_clear(mm, addr, pmd);
1471 entry = pmd_modify(entry, newprot); 1493 entry = pmd_modify(entry, newprot);
1494 ret = HPAGE_PMD_NR;
1472 BUG_ON(pmd_write(entry)); 1495 BUG_ON(pmd_write(entry));
1473 } else { 1496 } else {
1474 struct page *page = pmd_page(*pmd); 1497 struct page *page = pmd_page(*pmd);
1475 1498
1476 /* only check non-shared pages */ 1499 /*
1477 if (page_mapcount(page) == 1 && 1500 * Do not trap faults against the zero page. The
1501 * read-only data is likely to be read-cached on the
1502 * local CPU cache and it is less useful to know about
1503 * local vs remote hits on the zero page.
1504 */
1505 if (!is_huge_zero_page(page) &&
1478 !pmd_numa(*pmd)) { 1506 !pmd_numa(*pmd)) {
1507 entry = pmdp_get_and_clear(mm, addr, pmd);
1479 entry = pmd_mknuma(entry); 1508 entry = pmd_mknuma(entry);
1509 ret = HPAGE_PMD_NR;
1480 } 1510 }
1481 } 1511 }
1482 set_pmd_at(mm, addr, pmd, entry); 1512
1513 /* Set PMD if cleared earlier */
1514 if (ret == HPAGE_PMD_NR)
1515 set_pmd_at(mm, addr, pmd, entry);
1516
1483 spin_unlock(&vma->vm_mm->page_table_lock); 1517 spin_unlock(&vma->vm_mm->page_table_lock);
1484 ret = 1;
1485 } 1518 }
1486 1519
1487 return ret; 1520 return ret;
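
The three return values give the caller in the mprotect path enough information to both skip the per-PTE loop and decide whether a TLB flush is needed. A purely illustrative userspace sketch of how a caller could fold them into its accounting (hypothetical helper, not the actual change_pmd_range() code; HPAGE_PMD_NR is 512 for 2MB huge pages over 4KB base pages on x86-64):

#include <stdio.h>

#define HPAGE_PMD_NR 512

static void handle_thp_result(int ret, unsigned long *pages, int *need_flush)
{
    if (ret == 0)
        return;                 /* PMD wasn't a locked THP: fall back to the PTE loop */
    if (ret == HPAGE_PMD_NR) {
        *pages += HPAGE_PMD_NR; /* protections changed for the whole huge page */
        *need_flush = 1;        /* and the TLB must be flushed */
    }
    /* ret == 1: huge PMD handled, but nothing changed and no flush is needed */
}

int main(void)
{
    unsigned long pages = 0;
    int need_flush = 0;

    handle_thp_result(HPAGE_PMD_NR, &pages, &need_flush);
    handle_thp_result(1, &pages, &need_flush);
    printf("pages updated: %lu, flush needed: %d\n", pages, need_flush);
    return 0;
}
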
@@ -1662,7 +1695,7 @@ static void __split_huge_page_refcount(struct page *page,
1662 page_tail->mapping = page->mapping; 1695 page_tail->mapping = page->mapping;
1663 1696
1664 page_tail->index = page->index + i; 1697 page_tail->index = page->index + i;
1665 page_nid_xchg_last(page_tail, page_nid_last(page)); 1698 page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
1666 1699
1667 BUG_ON(!PageAnon(page_tail)); 1700 BUG_ON(!PageAnon(page_tail));
1668 BUG_ON(!PageUptodate(page_tail)); 1701 BUG_ON(!PageUptodate(page_tail));
diff --git a/mm/memory.c b/mm/memory.c
index d176154c243f..1f2287eaa88e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
69 69
70#include "internal.h" 70#include "internal.h"
71 71
72#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 72#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. 73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
74#endif 74#endif
75 75
76#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -2721,6 +2721,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2721 get_page(dirty_page); 2721 get_page(dirty_page);
2722 2722
2723reuse: 2723reuse:
2724 /*
2725 * Clear the pages cpupid information as the existing
2726 * information potentially belongs to a now completely
2727 * unrelated process.
2728 */
2729 if (old_page)
2730 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
2731
2724 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2732 flush_cache_page(vma, address, pte_pfn(orig_pte));
2725 entry = pte_mkyoung(orig_pte); 2733 entry = pte_mkyoung(orig_pte);
2726 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2734 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3521,13 +3529,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3521} 3529}
3522 3530
3523int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3531int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3524 unsigned long addr, int page_nid) 3532 unsigned long addr, int page_nid,
3533 int *flags)
3525{ 3534{
3526 get_page(page); 3535 get_page(page);
3527 3536
3528 count_vm_numa_event(NUMA_HINT_FAULTS); 3537 count_vm_numa_event(NUMA_HINT_FAULTS);
3529 if (page_nid == numa_node_id()) 3538 if (page_nid == numa_node_id()) {
3530 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 3539 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3540 *flags |= TNF_FAULT_LOCAL;
3541 }
3531 3542
3532 return mpol_misplaced(page, vma, addr); 3543 return mpol_misplaced(page, vma, addr);
3533} 3544}
@@ -3538,8 +3549,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3538 struct page *page = NULL; 3549 struct page *page = NULL;
3539 spinlock_t *ptl; 3550 spinlock_t *ptl;
3540 int page_nid = -1; 3551 int page_nid = -1;
3552 int last_cpupid;
3541 int target_nid; 3553 int target_nid;
3542 bool migrated = false; 3554 bool migrated = false;
3555 int flags = 0;
3543 3556
3544 /* 3557 /*
3545 * The "pte" at this point cannot be used safely without 3558 * The "pte" at this point cannot be used safely without
@@ -3566,9 +3579,26 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3566 pte_unmap_unlock(ptep, ptl); 3579 pte_unmap_unlock(ptep, ptl);
3567 return 0; 3580 return 0;
3568 } 3581 }
3582 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3569 3583
3584 /*
3585	 * Avoid grouping on DSO/COW pages in particular and RO pages
3586	 * in general; RO pages shouldn't hurt as much anyway since
3587 * they can be in shared cache state.
3588 */
3589 if (!pte_write(pte))
3590 flags |= TNF_NO_GROUP;
3591
3592 /*
3593 * Flag if the page is shared between multiple address spaces. This
3594 * is later used when determining whether to group tasks together
3595 */
3596 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3597 flags |= TNF_SHARED;
3598
3599 last_cpupid = page_cpupid_last(page);
3570 page_nid = page_to_nid(page); 3600 page_nid = page_to_nid(page);
3571 target_nid = numa_migrate_prep(page, vma, addr, page_nid); 3601 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
3572 pte_unmap_unlock(ptep, ptl); 3602 pte_unmap_unlock(ptep, ptl);
3573 if (target_nid == -1) { 3603 if (target_nid == -1) {
3574 put_page(page); 3604 put_page(page);
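
The flags accumulated above travel to task_numa_fault() as a small bitmask describing the fault. A sketch of that accumulation with placeholder TNF_* values (the kernel's actual definitions live in the include/linux/sched.h part of this series and are not quoted here):

#include <stdio.h>

#define TNF_MIGRATED    0x01
#define TNF_NO_GROUP    0x02
#define TNF_SHARED      0x04
#define TNF_FAULT_LOCAL 0x08

static int classify_fault(int local, int writable, int shared, int migrated)
{
    int flags = 0;

    if (local)
        flags |= TNF_FAULT_LOCAL;   /* fault was satisfied on the local node */
    if (!writable)
        flags |= TNF_NO_GROUP;      /* RO/COW/DSO page: don't use it for grouping */
    if (shared)
        flags |= TNF_SHARED;        /* mapped by more than one address space */
    if (migrated)
        flags |= TNF_MIGRATED;      /* page actually moved to the target node */
    return flags;
}

int main(void)
{
    printf("flags = %#x\n", classify_fault(1, 0, 1, 0));
    return 0;
}
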
@@ -3576,102 +3606,17 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	/* Migrate to the requested node */
-	migrated = migrate_misplaced_page(page, target_nid);
-	if (migrated)
+	migrated = migrate_misplaced_page(page, vma, target_nid);
+	if (migrated) {
 		page_nid = target_nid;
+		flags |= TNF_MIGRATED;
+	}
 
 out:
 	if (page_nid != -1)
-		task_numa_fault(page_nid, 1, migrated);
-	return 0;
-}
-
-/* NUMA hinting page fault entry point for regular pmds */
-#ifdef CONFIG_NUMA_BALANCING
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long addr, pmd_t *pmdp)
-{
-	pmd_t pmd;
-	pte_t *pte, *orig_pte;
-	unsigned long _addr = addr & PMD_MASK;
-	unsigned long offset;
-	spinlock_t *ptl;
-	bool numa = false;
-
-	spin_lock(&mm->page_table_lock);
-	pmd = *pmdp;
-	if (pmd_numa(pmd)) {
-		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
-		numa = true;
-	}
-	spin_unlock(&mm->page_table_lock);
-
-	if (!numa)
-		return 0;
-
-	/* we're in a page fault so some vma must be in the range */
-	BUG_ON(!vma);
-	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
-	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
-	VM_BUG_ON(offset >= PMD_SIZE);
-	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
-	pte += offset >> PAGE_SHIFT;
-	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
-		pte_t pteval = *pte;
-		struct page *page;
-		int page_nid = -1;
-		int target_nid;
-		bool migrated = false;
-
-		if (!pte_present(pteval))
-			continue;
-		if (!pte_numa(pteval))
-			continue;
-		if (addr >= vma->vm_end) {
-			vma = find_vma(mm, addr);
-			/* there's a pte present so there must be a vma */
-			BUG_ON(!vma);
-			BUG_ON(addr < vma->vm_start);
-		}
-		if (pte_numa(pteval)) {
-			pteval = pte_mknonnuma(pteval);
-			set_pte_at(mm, addr, pte, pteval);
-		}
-		page = vm_normal_page(vma, addr, pteval);
-		if (unlikely(!page))
-			continue;
-		/* only check non-shared pages */
-		if (unlikely(page_mapcount(page) != 1))
-			continue;
-
-		page_nid = page_to_nid(page);
-		target_nid = numa_migrate_prep(page, vma, addr, page_nid);
-		pte_unmap_unlock(pte, ptl);
-		if (target_nid != -1) {
-			migrated = migrate_misplaced_page(page, target_nid);
-			if (migrated)
-				page_nid = target_nid;
-		} else {
-			put_page(page);
-		}
-
-		if (page_nid != -1)
-			task_numa_fault(page_nid, 1, migrated);
-
-		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-	}
-	pte_unmap_unlock(orig_pte, ptl);
-
-	return 0;
-}
-#else
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long addr, pmd_t *pmdp)
-{
-	BUG();
+		task_numa_fault(last_cpupid, page_nid, 1, flags);
 	return 0;
 }
-#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * These routines also need to handle stuff like marking pages dirty
@@ -3811,8 +3756,8 @@ retry:
 		}
 	}
 
-	if (pmd_numa(*pmd))
-		return do_pmd_numa_page(mm, vma, address, pmd);
+	/* THP should already have been handled */
+	BUG_ON(pmd_numa(*pmd));
 
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't
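The mm/memory.c hunks above replace the per-page "last NUMA node" hint with a "last cpupid", which folds the last faulting CPU and the low bits of the faulting task's pid into one value so later faults can distinguish private from shared access. The real encoding helpers are defined elsewhere in this series (include/linux/mm.h and page-flags-layout.h); the following is only a rough sketch of the idea, with made-up field widths and ex_-prefixed names:

/* Illustrative only: field widths and names do not match the kernel's. */
#define EX_PID_BITS	8
#define EX_PID_MASK	((1 << EX_PID_BITS) - 1)

static inline int ex_cpu_pid_to_cpupid(int cpu, int pid)
{
	/* keep only the low pid bits; collisions are acceptable for a hint */
	return (cpu << EX_PID_BITS) | (pid & EX_PID_MASK);
}

static inline int ex_cpupid_to_cpu(int cpupid)
{
	return cpupid >> EX_PID_BITS;
}

static inline int ex_cpupid_to_pid(int cpupid)
{
	return cpupid & EX_PID_MASK;
}

Because only the low pid bits are stored, a cpupid match is a strong hint rather than a guarantee, which is good enough for fault statistics.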
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04729647f359..71cb253368cb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
 	return pol;
 }
 
+bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
+{
+	struct mempolicy *pol = get_task_policy(task);
+	if (vma) {
+		if (vma->vm_ops && vma->vm_ops->get_policy) {
+			bool ret = false;
+
+			pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+			if (pol && (pol->flags & MPOL_F_MOF))
+				ret = true;
+			mpol_cond_put(pol);
+
+			return ret;
+		} else if (vma->vm_policy) {
+			pol = vma->vm_policy;
+		}
+	}
+
+	if (!pol)
+		return default_policy.flags & MPOL_F_MOF;
+
+	return pol->flags & MPOL_F_MOF;
+}
+
 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
 {
 	enum zone_type dynamic_policy_zone = policy_zone;
@@ -2277,6 +2301,35 @@ static void sp_free(struct sp_node *n)
 	kmem_cache_free(sn_cache, n);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+	/* Never defer a private fault */
+	if (cpupid_match_pid(p, last_cpupid))
+		return false;
+
+	if (p->numa_migrate_deferred) {
+		p->numa_migrate_deferred--;
+		return true;
+	}
+	return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+	return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * mpol_misplaced - check whether current page node is valid in policy
  *
@@ -2300,6 +2353,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 	struct zone *zone;
 	int curnid = page_to_nid(page);
 	unsigned long pgoff;
+	int thiscpu = raw_smp_processor_id();
+	int thisnid = cpu_to_node(thiscpu);
 	int polnid = -1;
 	int ret = -1;
 
@@ -2348,9 +2403,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 
 	/* Migrate the page towards the node whose CPU is referencing it */
 	if (pol->flags & MPOL_F_MORON) {
-		int last_nid;
+		int last_cpupid;
+		int this_cpupid;
 
-		polnid = numa_node_id();
+		polnid = thisnid;
+		this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
 
 		/*
 		 * Multi-stage node selection is used in conjunction
@@ -2373,8 +2430,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 * it less likely we act on an unlikely task<->page
 		 * relation.
 		 */
-		last_nid = page_nid_xchg_last(page, polnid);
-		if (last_nid != polnid)
+		last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+			/* See sysctl_numa_balancing_migrate_deferred comment */
+			if (!cpupid_match_pid(current, last_cpupid))
+				defer_numa_migrate(current);
+
+			goto out;
+		}
+
+		/*
+		 * The quadratic filter above reduces extraneous migration
+		 * of shared pages somewhat. This code reduces it even more,
+		 * reducing the overhead of page migrations of shared pages.
+		 * This makes workloads with shared pages rely more on
+		 * "move task near its memory", and less on "move memory
+		 * towards its task", which is exactly what we want.
+		 */
+		if (numa_migrate_deferred(current, last_cpupid))
 			goto out;
 	}
 
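The numa_migrate_deferred()/defer_numa_migrate() pair added above implements a per-task back-off: once a shared page is seen bouncing between nodes, the next sysctl_numa_balancing_migrate_deferred misplacement checks for that task are skipped, so shared pages rely more on moving the task than on moving memory. A minimal standalone sketch of that counter pattern follows; the names and the default count are illustrative, not the kernel's:

struct ex_task {
	int numa_migrate_deferred;	/* remaining checks to skip */
};

static int ex_migrate_deferred_max = 16;	/* stand-in for the sysctl */

static void ex_defer_numa_migrate(struct ex_task *t)
{
	t->numa_migrate_deferred = ex_migrate_deferred_max;
}

static int ex_numa_migrate_deferred(struct ex_task *t)
{
	if (t->numa_migrate_deferred) {
		t->numa_migrate_deferred--;
		return 1;	/* skip this migration opportunity */
	}
	return 0;
}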
diff --git a/mm/migrate.c b/mm/migrate.c
index c04692774e88..dfc8300ecbb2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -445,6 +445,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
  */
 void migrate_page_copy(struct page *newpage, struct page *page)
 {
+	int cpupid;
+
 	if (PageHuge(page) || PageTransHuge(page))
 		copy_huge_page(newpage, page);
 	else
@@ -481,6 +483,13 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 		__set_page_dirty_nobuffers(newpage);
 	}
 
+	/*
+	 * Copy NUMA information to the new page, to prevent over-eager
+	 * future migrations of this same page.
+	 */
+	cpupid = page_cpupid_xchg_last(page, -1);
+	page_cpupid_xchg_last(newpage, cpupid);
+
 	mlock_migrate_page(newpage, page);
 	ksm_migrate_page(newpage, page);
 	/*
@@ -1500,7 +1509,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 					  __GFP_NOWARN) &
 					 ~GFP_IOFS, 0);
 	if (newpage)
-		page_nid_xchg_last(newpage, page_nid_last(page));
+		page_cpupid_xchg_last(newpage, page_cpupid_last(page));
 
 	return newpage;
 }
@@ -1601,7 +1610,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
  * node. Caller is expected to have an elevated reference count on
  * the page that will be dropped by this function before returning.
  */
-int migrate_misplaced_page(struct page *page, int node)
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+			   int node)
 {
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated;
@@ -1609,10 +1619,11 @@ int migrate_misplaced_page(struct page *page, int node)
 	LIST_HEAD(migratepages);
 
 	/*
-	 * Don't migrate pages that are mapped in multiple processes.
-	 * TODO: Handle false sharing detection instead of this hammer
+	 * Don't migrate file pages that are mapped in multiple processes
+	 * with execute permissions as they are probably shared libraries.
 	 */
-	if (page_mapcount(page) != 1)
+	if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+	    (vma->vm_flags & VM_EXEC))
 		goto out;
 
 	/*
@@ -1663,13 +1674,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	int page_lru = page_is_file_cache(page);
 
 	/*
-	 * Don't migrate pages that are mapped in multiple processes.
-	 * TODO: Handle false sharing detection instead of this hammer
-	 */
-	if (page_mapcount(page) != 1)
-		goto out_dropref;
-
-	/*
 	 * Rate-limit the amount of data that is being migrated to a node.
 	 * Optimal placement is no good if the memory bus is saturated and
 	 * all the time is being spent migrating!
@@ -1682,7 +1686,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	if (!new_page)
 		goto out_fail;
 
-	page_nid_xchg_last(new_page, page_nid_last(page));
+	page_cpupid_xchg_last(new_page, page_cpupid_last(page));
 
 	isolated = numamigrate_isolate_page(pgdat, page);
 	if (!isolated) {
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 633c08863fd8..68562e92d50c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void)
 	unsigned long or_mask, add_mask;
 
 	shift = 8 * sizeof(unsigned long);
-	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
+	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-		"Section %d Node %d Zone %d Lastnid %d Flags %d\n",
+		"Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
 		SECTIONS_WIDTH,
 		NODES_WIDTH,
 		ZONES_WIDTH,
-		LAST_NID_WIDTH,
+		LAST_CPUPID_WIDTH,
 		NR_PAGEFLAGS);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
-		"Section %d Node %d Zone %d Lastnid %d\n",
+		"Section %d Node %d Zone %d Lastcpupid %d\n",
 		SECTIONS_SHIFT,
 		NODES_SHIFT,
 		ZONES_SHIFT,
-		LAST_NID_SHIFT);
+		LAST_CPUPID_SHIFT);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
-		"Section %lu Node %lu Zone %lu Lastnid %lu\n",
+		"Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
 		(unsigned long)SECTIONS_PGSHIFT,
 		(unsigned long)NODES_PGSHIFT,
 		(unsigned long)ZONES_PGSHIFT,
-		(unsigned long)LAST_NID_PGSHIFT);
+		(unsigned long)LAST_CPUPID_PGSHIFT);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
 		"Node/Zone ID: %lu -> %lu\n",
 		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void)
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
 		"Node not in page flags");
 #endif
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
-		"Last nid not in page flags");
+		"Last cpupid not in page flags");
 #endif
 
 	if (SECTIONS_WIDTH) {
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 2ac0afbd68f3..bf34fb8556db 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
 }
 
-#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
-int page_nid_xchg_last(struct page *page, int nid)
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
+int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
 	unsigned long old_flags, flags;
-	int last_nid;
+	int last_cpupid;
 
 	do {
 		old_flags = flags = page->flags;
-		last_nid = page_nid_last(page);
+		last_cpupid = page_cpupid_last(page);
 
-		flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
-		flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+		flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
+		flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
 	} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
 
-	return last_nid;
+	return last_cpupid;
 }
 #endif
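page_cpupid_xchg_last() above updates a bitfield packed into page->flags without taking a lock: it rebuilds the word and retries with cmpxchg() until no other CPU has modified the flags in between. A self-contained C11 version of the same read-modify-write pattern, using an illustrative field layout rather than the kernel's:

#include <stdatomic.h>

#define EX_FIELD_SHIFT	8
#define EX_FIELD_MASK	0xffUL

/* Atomically replace the field and return its previous value. */
static unsigned long ex_field_xchg(_Atomic unsigned long *flags,
				   unsigned long newval)
{
	unsigned long old, repl;

	old = atomic_load(flags);
	do {
		repl = old & ~(EX_FIELD_MASK << EX_FIELD_SHIFT);
		repl |= (newval & EX_FIELD_MASK) << EX_FIELD_SHIFT;
		/* on failure, 'old' is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(flags, &old, repl));

	return (old >> EX_FIELD_SHIFT) & EX_FIELD_MASK;
}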
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 412ba2b7326a..a597f2ffcd6f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,14 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa, bool *ret_all_same_node)
+		int dirty_accountable, int prot_numa)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 	unsigned long pages = 0;
-	bool all_same_node = true;
-	int last_nid = -1;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -63,15 +61,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 			page = vm_normal_page(vma, addr, oldpte);
 			if (page) {
-				int this_nid = page_to_nid(page);
-				if (last_nid == -1)
-					last_nid = this_nid;
-				if (last_nid != this_nid)
-					all_same_node = false;
-
-				/* only check non-shared pages */
-				if (!pte_numa(oldpte) &&
-				    page_mapcount(page) == 1) {
+				if (!pte_numa(oldpte)) {
 					ptent = pte_mknuma(ptent);
 					updated = true;
 				}
@@ -104,33 +94,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				if (pte_swp_soft_dirty(oldpte))
 					newpte = pte_swp_mksoft_dirty(newpte);
 				set_pte_at(mm, addr, pte, newpte);
+
+				pages++;
 			}
-			pages++;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
-	*ret_all_same_node = all_same_node;
 	return pages;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
-				       pmd_t *pmd)
-{
-	spin_lock(&mm->page_table_lock);
-	set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
-	spin_unlock(&mm->page_table_lock);
-}
-#else
-static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
-				       pmd_t *pmd)
-{
-	BUG();
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		pud_t *pud, unsigned long addr, unsigned long end,
 		pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -138,34 +112,33 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 	pmd_t *pmd;
 	unsigned long next;
 	unsigned long pages = 0;
-	bool all_same_node;
 
 	pmd = pmd_offset(pud, addr);
 	do {
+		unsigned long this_pages;
+
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*pmd)) {
 			if (next - addr != HPAGE_PMD_SIZE)
 				split_huge_page_pmd(vma, addr, pmd);
-			else if (change_huge_pmd(vma, pmd, addr, newprot,
-						 prot_numa)) {
-				pages++;
-				continue;
+			else {
+				int nr_ptes = change_huge_pmd(vma, pmd, addr,
+						newprot, prot_numa);
+
+				if (nr_ptes) {
+					if (nr_ptes == HPAGE_PMD_NR)
+						pages++;
+
+					continue;
+				}
 			}
 			/* fall through */
 		}
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		pages += change_pte_range(vma, pmd, addr, next, newprot,
-				 dirty_accountable, prot_numa, &all_same_node);
-
-		/*
-		 * If we are changing protections for NUMA hinting faults then
-		 * set pmd_numa if the examined pages were all on the same
-		 * node. This allows a regular PMD to be handled as one fault
-		 * and effectively batches the taking of the PTL
-		 */
-		if (prot_numa && all_same_node)
-			change_pmd_protnuma(vma->vm_mm, addr, pmd);
+		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
+				 dirty_accountable, prot_numa);
+		pages += this_pages;
 	} while (pmd++, addr = next, addr != end);
 
 	return pages;
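With the per-pte batching removed, change_pmd_range() above relies on change_huge_pmd() reporting how many PTEs it covered: zero means fall through to the per-pte path, and the page counter is bumped only when the whole huge PMD (HPAGE_PMD_NR entries) was updated. A compressed sketch of that accounting decision, with illustrative names and signature:

/* Illustrative only: how the nr_ptes return value is interpreted. */
static int ex_account_huge_pmd(int nr_ptes, int hpage_pmd_nr,
			       unsigned long *pages)
{
	if (!nr_ptes)
		return 0;		/* not handled, fall through to PTEs */

	if (nr_ptes == hpage_pmd_nr)
		(*pages)++;		/* the whole huge PMD was changed */

	return 1;			/* handled, skip the PTE walk */
}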
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd886fac451a..73d812f16dde 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page)
 		bad_page(page);
 		return 1;
 	}
-	page_nid_reset_last(page);
+	page_cpupid_reset_last(page);
 	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
 		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return 0;
@@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		mminit_verify_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		page_mapcount_reset(page);
-		page_nid_reset_last(page);
+		page_cpupid_reset_last(page);
 		SetPageReserved(page);
 		/*
 		 * Mark the block movable so that blocks are reserved for
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 0578d4fa00a9..0f676908d15b 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -2563,9 +2563,8 @@ bed:
 			  jiffies + msecs_to_jiffies(val));
 
 		/* Wait for IR-LMP to call us back */
-		__wait_event_interruptible(self->query_wait,
-			(self->cachedaddr != 0 || self->errno == -ETIME),
-			err);
+		err = __wait_event_interruptible(self->query_wait,
+			(self->cachedaddr != 0 || self->errno == -ETIME));
 
 		/* If watchdog is still activated, kill it! */
 		del_timer(&(self->watchdog));
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index f4484719f3e6..f63c2388f38d 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1637,12 +1637,9 @@ static int sync_thread_master(void *data)
 			continue;
 		}
 		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
-			int ret = 0;
-
-			__wait_event_interruptible(*sk_sleep(sk),
+			int ret = __wait_event_interruptible(*sk_sleep(sk),
 						   sock_writeable(sk) ||
-						   kthread_should_stop(),
-						   ret);
+						   kthread_should_stop());
 			if (unlikely(kthread_should_stop()))
 				goto done;
 		}
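Both networking call sites above pick up the include/linux/wait.h rework from this series: __wait_event_interruptible() now returns its status instead of writing into a caller-supplied variable. Roughly, the calling convention changes as follows; the wait queue and condition here are placeholders, not code from the patch:

/* old style: the third argument receives the result */
int err = 0;
__wait_event_interruptible(wq, condition, err);

/* new style: the macro's value is the result (0, or -ERESTARTSYS on signal) */
int err = __wait_event_interruptible(wq, condition);
if (err)
	return err;	/* interrupted by a signal */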