-rw-r--r--  Documentation/sysctl/kernel.txt | 76
-rw-r--r--  MAINTAINERS | 2
-rw-r--r--  arch/alpha/include/asm/Kbuild | 1
-rw-r--r--  arch/arc/include/asm/Kbuild | 1
-rw-r--r--  arch/arm/include/asm/Kbuild | 1
-rw-r--r--  arch/arm64/include/asm/Kbuild | 1
-rw-r--r--  arch/avr32/include/asm/Kbuild | 1
-rw-r--r--  arch/blackfin/include/asm/Kbuild | 1
-rw-r--r--  arch/c6x/include/asm/Kbuild | 1
-rw-r--r--  arch/cris/include/asm/Kbuild | 1
-rw-r--r--  arch/frv/include/asm/Kbuild | 1
-rw-r--r--  arch/h8300/include/asm/Kbuild | 1
-rw-r--r--  arch/hexagon/include/asm/Kbuild | 1
-rw-r--r--  arch/ia64/include/asm/Kbuild | 1
-rw-r--r--  arch/m32r/include/asm/Kbuild | 1
-rw-r--r--  arch/m68k/include/asm/Kbuild | 1
-rw-r--r--  arch/metag/include/asm/Kbuild | 1
-rw-r--r--  arch/metag/include/asm/topology.h | 2
-rw-r--r--  arch/microblaze/include/asm/Kbuild | 1
-rw-r--r--  arch/mips/include/asm/Kbuild | 1
-rw-r--r--  arch/mips/kernel/rtlx.c | 19
-rw-r--r--  arch/mips/mm/init.c | 5
-rw-r--r--  arch/mn10300/include/asm/Kbuild | 1
-rw-r--r--  arch/openrisc/include/asm/Kbuild | 1
-rw-r--r--  arch/parisc/include/asm/Kbuild | 1
-rw-r--r--  arch/powerpc/include/asm/Kbuild | 1
-rw-r--r--  arch/s390/include/asm/Kbuild | 1
-rw-r--r--  arch/score/include/asm/Kbuild | 1
-rw-r--r--  arch/sh/include/asm/Kbuild | 1
-rw-r--r--  arch/sparc/include/asm/Kbuild | 1
-rw-r--r--  arch/tile/include/asm/Kbuild | 1
-rw-r--r--  arch/um/include/asm/Kbuild | 1
-rw-r--r--  arch/unicore32/include/asm/Kbuild | 1
-rw-r--r--  arch/x86/include/asm/atomic.h | 29
-rw-r--r--  arch/x86/include/asm/atomic64_64.h | 28
-rw-r--r--  arch/x86/include/asm/bitops.h | 24
-rw-r--r--  arch/x86/include/asm/calling.h | 50
-rw-r--r--  arch/x86/include/asm/local.h | 28
-rw-r--r--  arch/x86/include/asm/preempt.h | 100
-rw-r--r--  arch/x86/include/asm/rmwcc.h | 41
-rw-r--r--  arch/x86/include/asm/thread_info.h | 5
-rw-r--r--  arch/x86/kernel/Makefile | 2
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 1
-rw-r--r--  arch/x86/kernel/cpu/common.c | 5
-rw-r--r--  arch/x86/kernel/entry_32.S | 7
-rw-r--r--  arch/x86/kernel/entry_64.S | 4
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c | 7
-rw-r--r--  arch/x86/kernel/irq_32.c | 4
-rw-r--r--  arch/x86/kernel/preempt.S | 25
-rw-r--r--  arch/x86/kernel/process.c | 6
-rw-r--r--  arch/x86/kernel/process_32.c | 8
-rw-r--r--  arch/x86/kernel/process_64.c | 8
-rw-r--r--  arch/x86/kernel/traps.c | 4
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 7
-rw-r--r--  arch/xtensa/include/asm/Kbuild | 1
-rw-r--r--  drivers/acpi/processor_idle.c | 46
-rw-r--r--  drivers/idle/intel_idle.c | 2
-rw-r--r--  fs/exec.c | 1
-rw-r--r--  fs/proc/array.c | 2
-rw-r--r--  include/asm-generic/preempt.h | 105
-rw-r--r--  include/linux/hardirq.h | 8
-rw-r--r--  include/linux/mempolicy.h | 1
-rw-r--r--  include/linux/migrate.h | 7
-rw-r--r--  include/linux/mm.h | 118
-rw-r--r--  include/linux/mm_types.h | 17
-rw-r--r--  include/linux/page-flags-layout.h | 28
-rw-r--r--  include/linux/preempt.h | 112
-rw-r--r--  include/linux/sched.h | 167
-rw-r--r--  include/linux/sched/sysctl.h | 1
-rw-r--r--  include/linux/stop_machine.h | 1
-rw-r--r--  include/linux/thread_info.h | 17
-rw-r--r--  include/linux/topology.h | 6
-rw-r--r--  include/linux/tty.h | 28
-rw-r--r--  include/linux/uaccess.h | 8
-rw-r--r--  include/linux/wait.h | 364
-rw-r--r--  include/trace/events/sched.h | 2
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/bounds.c | 4
-rw-r--r--  kernel/context_tracking.c | 2
-rw-r--r--  kernel/cpu/idle.c | 16
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/rcutree.c | 15
-rw-r--r--  kernel/sched/core.c | 262
-rw-r--r--  kernel/sched/debug.c | 60
-rw-r--r--  kernel/sched/fair.c | 1313
-rw-r--r--  kernel/sched/features.h | 19
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 8
-rw-r--r--  kernel/sched/sched.h | 31
-rw-r--r--  kernel/sched/stats.h | 46
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/softirq.c | 16
-rw-r--r--  kernel/stop_machine.c | 272
-rw-r--r--  kernel/sysctl.c | 21
-rw-r--r--  kernel/timer.c | 8
-rw-r--r--  lib/locking-selftest.c | 2
-rw-r--r--  lib/smp_processor_id.c | 3
-rw-r--r--  mm/huge_memory.c | 119
-rw-r--r--  mm/memory.c | 158
-rw-r--r--  mm/mempolicy.c | 82
-rw-r--r--  mm/migrate.c | 49
-rw-r--r--  mm/mm_init.c | 18
-rw-r--r--  mm/mmzone.c | 14
-rw-r--r--  mm/mprotect.c | 65
-rw-r--r--  mm/page_alloc.c | 4
-rw-r--r--  net/irda/af_irda.c | 5
-rw-r--r--  net/netfilter/ipvs/ip_vs_sync.c | 7
107 files changed, 3008 insertions, 1189 deletions
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 9d4c1d18ad44..4273b2d71a27 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -355,6 +355,82 @@ utilize.
355 355
356============================================================== 356==============================================================
357 357
358numa_balancing
359
360Enables/disables automatic page fault based NUMA memory
361balancing. Memory is moved automatically to nodes
362that access it often.
363
364Enables/disables automatic NUMA memory balancing. On NUMA machines, there
365is a performance penalty if remote memory is accessed by a CPU. When this
366feature is enabled, the kernel samples which task or thread is accessing memory
367by periodically unmapping pages and later trapping a page fault. At the
368time of the page fault, it is determined if the data being accessed should
369be migrated to a local memory node.
370
371The unmapping of pages and trapping faults incur additional overhead that
372ideally is offset by improved memory locality but there is no universal
373guarantee. If the target workload is already bound to NUMA nodes then this
374feature should be disabled. Otherwise, if the system overhead from the
375feature is too high then the rate the kernel samples for NUMA hinting
376faults may be controlled by the numa_balancing_scan_period_min_ms,
377numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
378numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
379numa_balancing_migrate_deferred.
380
381==============================================================
382
383numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms,
384numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
385
386Automatic NUMA balancing scans a task's address space and unmaps pages to
387detect if pages are properly placed or if the data should be migrated to a
388memory node local to where the task is running. Every "scan delay" the task
389scans the next "scan size" number of pages in its address space. When the
390end of the address space is reached the scanner restarts from the beginning.
391
392In combination, the "scan delay" and "scan size" determine the scan rate.
393When "scan delay" decreases, the scan rate increases. The scan delay and
394hence the scan rate of every task is adaptive and depends on historical
395behaviour. If pages are properly placed then the scan delay increases,
396otherwise the scan delay decreases. The "scan size" is not adaptive but
397the higher the "scan size", the higher the scan rate.
398
399Higher scan rates incur higher system overhead as page faults must be
400trapped and potentially data must be migrated. However, the higher the scan
401rate, the more quickly a task's memory is migrated to a local node when the
402workload pattern changes, minimising the performance impact of remote
403memory accesses. These sysctls control the thresholds for scan delays and
404the number of pages scanned.
405
406numa_balancing_scan_period_min_ms is the minimum time in milliseconds to
407scan a task's virtual memory. It effectively controls the maximum scanning
408rate for each task.
409
410numa_balancing_scan_delay_ms is the starting "scan delay" used for a task
411when it initially forks.
412
413numa_balancing_scan_period_max_ms is the maximum time in milliseconds to
414scan a task's virtual memory. It effectively controls the minimum scanning
415rate for each task.
416
417numa_balancing_scan_size_mb is how many megabytes' worth of pages are
418scanned for a given scan.
419
420numa_balancing_settle_count is how many scan periods must complete before
421the scheduler's load balancer stops pushing the task towards a preferred node. This
422gives the scheduler a chance to place the task on an alternative node if the
423preferred node is overloaded.
424
425numa_balancing_migrate_deferred is how many page migrations get skipped
426unconditionally, after a page migration is skipped because a page is shared
427with other tasks. This reduces page migration overhead, and determines
428how much stronger the "move task near its memory" scheduler policy becomes,
429versus the "move memory near its task" memory management policy, for workloads
430with shared memory.
431
432==============================================================
433
358osrelease, ostype & version: 434osrelease, ostype & version:
359 435
360# cat osrelease 436# cat osrelease
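
[Note: the knobs documented above are "kernel" sysctls, so they are exposed under
/proc/sys/kernel/. A minimal, hypothetical tuning session, with values chosen for
illustration only and not taken from this patch:

# echo 1 > /proc/sys/kernel/numa_balancing
# echo 2000 > /proc/sys/kernel/numa_balancing_scan_period_min_ms
# echo 128 > /proc/sys/kernel/numa_balancing_scan_size_mb

Raising the minimum scan period and/or lowering the scan size is the documented way
to trade memory-locality responsiveness for lower fault-trapping overhead.]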
diff --git a/MAINTAINERS b/MAINTAINERS
index 8a0cbf3cf2c8..aee6733391cb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7282,6 +7282,8 @@ S: Maintained
7282F: kernel/sched/ 7282F: kernel/sched/
7283F: include/linux/sched.h 7283F: include/linux/sched.h
7284F: include/uapi/linux/sched.h 7284F: include/uapi/linux/sched.h
7285F: kernel/wait.c
7286F: include/linux/wait.h
7285 7287
7286SCORE ARCHITECTURE 7288SCORE ARCHITECTURE
7287M: Chen Liqin <liqin.linux@gmail.com> 7289M: Chen Liqin <liqin.linux@gmail.com>
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index a6e85f448c1c..f01fb505ad52 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
3 3
4generic-y += exec.h 4generic-y += exec.h
5generic-y += trace_clock.h 5generic-y += trace_clock.h
6generic-y += preempt.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index d8dd660898b9..5943f7f9d325 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -46,3 +46,4 @@ generic-y += ucontext.h
46generic-y += user.h 46generic-y += user.h
47generic-y += vga.h 47generic-y += vga.h
48generic-y += xor.h 48generic-y += xor.h
49generic-y += preempt.h
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index d3db39860b9c..4e6838d4ddf6 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -33,3 +33,4 @@ generic-y += timex.h
33generic-y += trace_clock.h 33generic-y += trace_clock.h
34generic-y += types.h 34generic-y += types.h
35generic-y += unaligned.h 35generic-y += unaligned.h
36generic-y += preempt.h
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 79a642d199f2..519f89f5b6a3 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -50,3 +50,4 @@ generic-y += unaligned.h
50generic-y += user.h 50generic-y += user.h
51generic-y += vga.h 51generic-y += vga.h
52generic-y += xor.h 52generic-y += xor.h
53generic-y += preempt.h
diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild
index fd7980743890..658001b52400 100644
--- a/arch/avr32/include/asm/Kbuild
+++ b/arch/avr32/include/asm/Kbuild
@@ -7,6 +7,7 @@ generic-y += div64.h
7generic-y += emergency-restart.h 7generic-y += emergency-restart.h
8generic-y += exec.h 8generic-y += exec.h
9generic-y += futex.h 9generic-y += futex.h
10generic-y += preempt.h
10generic-y += irq_regs.h 11generic-y += irq_regs.h
11generic-y += param.h 12generic-y += param.h
12generic-y += local.h 13generic-y += local.h
diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild
index 127826f8a375..f2b43474b0e2 100644
--- a/arch/blackfin/include/asm/Kbuild
+++ b/arch/blackfin/include/asm/Kbuild
@@ -44,3 +44,4 @@ generic-y += ucontext.h
44generic-y += unaligned.h 44generic-y += unaligned.h
45generic-y += user.h 45generic-y += user.h
46generic-y += xor.h 46generic-y += xor.h
47generic-y += preempt.h
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index e49f918531ad..fc0b3c356027 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -56,3 +56,4 @@ generic-y += ucontext.h
56generic-y += user.h 56generic-y += user.h
57generic-y += vga.h 57generic-y += vga.h
58generic-y += xor.h 58generic-y += xor.h
59generic-y += preempt.h
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index c8325455520e..b06caf649a95 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -11,3 +11,4 @@ generic-y += module.h
11generic-y += trace_clock.h 11generic-y += trace_clock.h
12generic-y += vga.h 12generic-y += vga.h
13generic-y += xor.h 13generic-y += xor.h
14generic-y += preempt.h
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild
index c5d767028306..74742dc6a3da 100644
--- a/arch/frv/include/asm/Kbuild
+++ b/arch/frv/include/asm/Kbuild
@@ -2,3 +2,4 @@
2generic-y += clkdev.h 2generic-y += clkdev.h
3generic-y += exec.h 3generic-y += exec.h
4generic-y += trace_clock.h 4generic-y += trace_clock.h
5generic-y += preempt.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index 8ada3cf0c98d..7e0e7213a481 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -6,3 +6,4 @@ generic-y += mmu.h
6generic-y += module.h 6generic-y += module.h
7generic-y += trace_clock.h 7generic-y += trace_clock.h
8generic-y += xor.h 8generic-y += xor.h
9generic-y += preempt.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 1da17caac23c..67c3450309b7 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -53,3 +53,4 @@ generic-y += types.h
53generic-y += ucontext.h 53generic-y += ucontext.h
54generic-y += unaligned.h 54generic-y += unaligned.h
55generic-y += xor.h 55generic-y += xor.h
56generic-y += preempt.h
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index a3456f34f672..f93ee087e8fe 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -3,4 +3,5 @@ generic-y += clkdev.h
3generic-y += exec.h 3generic-y += exec.h
4generic-y += kvm_para.h 4generic-y += kvm_para.h
5generic-y += trace_clock.h 5generic-y += trace_clock.h
6generic-y += preempt.h
6generic-y += vtime.h \ No newline at end of file 7generic-y += vtime.h \ No newline at end of file
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index bebdc36ebb0a..2b58c5f0bc38 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
3generic-y += exec.h 3generic-y += exec.h
4generic-y += module.h 4generic-y += module.h
5generic-y += trace_clock.h 5generic-y += trace_clock.h
6generic-y += preempt.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 09d77a862da3..a5d27f272a59 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -31,3 +31,4 @@ generic-y += trace_clock.h
31generic-y += types.h 31generic-y += types.h
32generic-y += word-at-a-time.h 32generic-y += word-at-a-time.h
33generic-y += xor.h 33generic-y += xor.h
34generic-y += preempt.h
diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild
index 6ae0ccb632cb..84d0c1d6b9b3 100644
--- a/arch/metag/include/asm/Kbuild
+++ b/arch/metag/include/asm/Kbuild
@@ -52,3 +52,4 @@ generic-y += unaligned.h
52generic-y += user.h 52generic-y += user.h
53generic-y += vga.h 53generic-y += vga.h
54generic-y += xor.h 54generic-y += xor.h
55generic-y += preempt.h
diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h
index 23f5118f58db..8e9c0b3b9691 100644
--- a/arch/metag/include/asm/topology.h
+++ b/arch/metag/include/asm/topology.h
@@ -26,6 +26,8 @@
26 .last_balance = jiffies, \ 26 .last_balance = jiffies, \
27 .balance_interval = 1, \ 27 .balance_interval = 1, \
28 .nr_balance_failed = 0, \ 28 .nr_balance_failed = 0, \
29 .max_newidle_lb_cost = 0, \
30 .next_decay_max_lb_cost = jiffies, \
29} 31}
30 32
31#define cpu_to_node(cpu) ((void)(cpu), 0) 33#define cpu_to_node(cpu) ((void)(cpu), 0)
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index d3c51a6a601d..ce0bbf8f5640 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
3generic-y += exec.h 3generic-y += exec.h
4generic-y += trace_clock.h 4generic-y += trace_clock.h
5generic-y += syscalls.h 5generic-y += syscalls.h
6generic-y += preempt.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 454ddf9bb76f..1acbb8b77a71 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -11,5 +11,6 @@ generic-y += sections.h
11generic-y += segment.h 11generic-y += segment.h
12generic-y += serial.h 12generic-y += serial.h
13generic-y += trace_clock.h 13generic-y += trace_clock.h
14generic-y += preempt.h
14generic-y += ucontext.h 15generic-y += ucontext.h
15generic-y += xor.h 16generic-y += xor.h
diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c
index d763f11e35e2..2c12ea1668d1 100644
--- a/arch/mips/kernel/rtlx.c
+++ b/arch/mips/kernel/rtlx.c
@@ -172,8 +172,9 @@ int rtlx_open(int index, int can_sleep)
172 if (rtlx == NULL) { 172 if (rtlx == NULL) {
173 if( (p = vpe_get_shared(tclimit)) == NULL) { 173 if( (p = vpe_get_shared(tclimit)) == NULL) {
174 if (can_sleep) { 174 if (can_sleep) {
175 __wait_event_interruptible(channel_wqs[index].lx_queue, 175 ret = __wait_event_interruptible(
176 (p = vpe_get_shared(tclimit)), ret); 176 channel_wqs[index].lx_queue,
177 (p = vpe_get_shared(tclimit)));
177 if (ret) 178 if (ret)
178 goto out_fail; 179 goto out_fail;
179 } else { 180 } else {
@@ -263,11 +264,10 @@ unsigned int rtlx_read_poll(int index, int can_sleep)
263 /* data available to read? */ 264 /* data available to read? */
264 if (chan->lx_read == chan->lx_write) { 265 if (chan->lx_read == chan->lx_write) {
265 if (can_sleep) { 266 if (can_sleep) {
266 int ret = 0; 267 int ret = __wait_event_interruptible(
267 268 channel_wqs[index].lx_queue,
268 __wait_event_interruptible(channel_wqs[index].lx_queue,
269 (chan->lx_read != chan->lx_write) || 269 (chan->lx_read != chan->lx_write) ||
270 sp_stopping, ret); 270 sp_stopping);
271 if (ret) 271 if (ret)
272 return ret; 272 return ret;
273 273
@@ -440,14 +440,13 @@ static ssize_t file_write(struct file *file, const char __user * buffer,
440 440
441 /* any space left... */ 441 /* any space left... */
442 if (!rtlx_write_poll(minor)) { 442 if (!rtlx_write_poll(minor)) {
443 int ret = 0; 443 int ret;
444 444
445 if (file->f_flags & O_NONBLOCK) 445 if (file->f_flags & O_NONBLOCK)
446 return -EAGAIN; 446 return -EAGAIN;
447 447
448 __wait_event_interruptible(channel_wqs[minor].rt_queue, 448 ret = __wait_event_interruptible(channel_wqs[minor].rt_queue,
449 rtlx_write_poll(minor), 449 rtlx_write_poll(minor));
450 ret);
451 if (ret) 450 if (ret)
452 return ret; 451 return ret;
453 } 452 }
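
[Note: the rtlx.c hunks above follow the reworked wait API, in which
__wait_event_interruptible() returns its status instead of storing it through a
third "ret" argument. A caller written against the new convention looks roughly
like this sketch; struct my_dev, its wait queue and its data_ready flag are
made-up names for illustration:

/* Sketch only: 'struct my_dev' and its members are hypothetical. */
static int my_dev_wait_for_data(struct my_dev *dev)
{
	int ret;

	/* Returns 0 once the condition is true, -ERESTARTSYS if interrupted. */
	ret = wait_event_interruptible(dev->wq, dev->data_ready);
	if (ret)
		return ret;

	return 0;
}]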
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index e205ef598e97..12156176c7ca 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -124,7 +124,7 @@ void *kmap_coherent(struct page *page, unsigned long addr)
124 124
125 BUG_ON(Page_dcache_dirty(page)); 125 BUG_ON(Page_dcache_dirty(page));
126 126
127 inc_preempt_count(); 127 pagefault_disable();
128 idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1); 128 idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
129#ifdef CONFIG_MIPS_MT_SMTC 129#ifdef CONFIG_MIPS_MT_SMTC
130 idx += FIX_N_COLOURS * smp_processor_id() + 130 idx += FIX_N_COLOURS * smp_processor_id() +
@@ -193,8 +193,7 @@ void kunmap_coherent(void)
193 write_c0_entryhi(old_ctx); 193 write_c0_entryhi(old_ctx);
194 EXIT_CRITICAL(flags); 194 EXIT_CRITICAL(flags);
195#endif 195#endif
196 dec_preempt_count(); 196 pagefault_enable();
197 preempt_check_resched();
198} 197}
199 198
200void copy_user_highpage(struct page *to, struct page *from, 199void copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild
index c5d767028306..74742dc6a3da 100644
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -2,3 +2,4 @@
2generic-y += clkdev.h 2generic-y += clkdev.h
3generic-y += exec.h 3generic-y += exec.h
4generic-y += trace_clock.h 4generic-y += trace_clock.h
5generic-y += preempt.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 195653e851da..78405625e799 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -67,3 +67,4 @@ generic-y += ucontext.h
67generic-y += user.h 67generic-y += user.h
68generic-y += word-at-a-time.h 68generic-y += word-at-a-time.h
69generic-y += xor.h 69generic-y += xor.h
70generic-y += preempt.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index ff4c9faed546..a603b9ebe54c 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -4,3 +4,4 @@ generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \
4 div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \ 4 div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \
5 poll.h xor.h clkdev.h exec.h 5 poll.h xor.h clkdev.h exec.h
6generic-y += trace_clock.h 6generic-y += trace_clock.h
7generic-y += preempt.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 704e6f10ae80..d8f9d2f18a23 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -2,4 +2,5 @@
2generic-y += clkdev.h 2generic-y += clkdev.h
3generic-y += rwsem.h 3generic-y += rwsem.h
4generic-y += trace_clock.h 4generic-y += trace_clock.h
5generic-y += preempt.h
5generic-y += vtime.h \ No newline at end of file 6generic-y += vtime.h \ No newline at end of file
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index f313f9cbcf44..7a5288f3479a 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -2,3 +2,4 @@
2 2
3generic-y += clkdev.h 3generic-y += clkdev.h
4generic-y += trace_clock.h 4generic-y += trace_clock.h
5generic-y += preempt.h
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild
index e1c7bb999b06..f3414ade77a3 100644
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -4,3 +4,4 @@ header-y +=
4generic-y += clkdev.h 4generic-y += clkdev.h
5generic-y += trace_clock.h 5generic-y += trace_clock.h
6generic-y += xor.h 6generic-y += xor.h
7generic-y += preempt.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 280bea9e5e2b..231efbb68108 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -34,3 +34,4 @@ generic-y += termios.h
34generic-y += trace_clock.h 34generic-y += trace_clock.h
35generic-y += ucontext.h 35generic-y += ucontext.h
36generic-y += xor.h 36generic-y += xor.h
37generic-y += preempt.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 7e4a97fbded4..bf390667657a 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -16,3 +16,4 @@ generic-y += serial.h
16generic-y += trace_clock.h 16generic-y += trace_clock.h
17generic-y += types.h 17generic-y += types.h
18generic-y += word-at-a-time.h 18generic-y += word-at-a-time.h
19generic-y += preempt.h
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 664d6ad23f80..22f3bd147fa7 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -38,3 +38,4 @@ generic-y += termios.h
38generic-y += trace_clock.h 38generic-y += trace_clock.h
39generic-y += types.h 39generic-y += types.h
40generic-y += xor.h 40generic-y += xor.h
41generic-y += preempt.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index b30f34a79882..fdde187e6087 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h
3generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h 3generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h
4generic-y += switch_to.h clkdev.h 4generic-y += switch_to.h clkdev.h
5generic-y += trace_clock.h 5generic-y += trace_clock.h
6generic-y += preempt.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 89d8b6c4e39a..00045cbe5c63 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -60,3 +60,4 @@ generic-y += unaligned.h
60generic-y += user.h 60generic-y += user.h
61generic-y += vga.h 61generic-y += vga.h
62generic-y += xor.h 62generic-y += xor.h
63generic-y += preempt.h
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 722aa3b04624..da31c8b8a92d 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -6,6 +6,7 @@
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/alternative.h> 7#include <asm/alternative.h>
8#include <asm/cmpxchg.h> 8#include <asm/cmpxchg.h>
9#include <asm/rmwcc.h>
9 10
10/* 11/*
11 * Atomic operations that C can't guarantee us. Useful for 12 * Atomic operations that C can't guarantee us. Useful for
@@ -76,12 +77,7 @@ static inline void atomic_sub(int i, atomic_t *v)
76 */ 77 */
77static inline int atomic_sub_and_test(int i, atomic_t *v) 78static inline int atomic_sub_and_test(int i, atomic_t *v)
78{ 79{
79 unsigned char c; 80 GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, i, "%0", "e");
80
81 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
82 : "+m" (v->counter), "=qm" (c)
83 : "ir" (i) : "memory");
84 return c;
85} 81}
86 82
87/** 83/**
@@ -118,12 +114,7 @@ static inline void atomic_dec(atomic_t *v)
118 */ 114 */
119static inline int atomic_dec_and_test(atomic_t *v) 115static inline int atomic_dec_and_test(atomic_t *v)
120{ 116{
121 unsigned char c; 117 GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
122
123 asm volatile(LOCK_PREFIX "decl %0; sete %1"
124 : "+m" (v->counter), "=qm" (c)
125 : : "memory");
126 return c != 0;
127} 118}
128 119
129/** 120/**
@@ -136,12 +127,7 @@ static inline int atomic_dec_and_test(atomic_t *v)
136 */ 127 */
137static inline int atomic_inc_and_test(atomic_t *v) 128static inline int atomic_inc_and_test(atomic_t *v)
138{ 129{
139 unsigned char c; 130 GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e");
140
141 asm volatile(LOCK_PREFIX "incl %0; sete %1"
142 : "+m" (v->counter), "=qm" (c)
143 : : "memory");
144 return c != 0;
145} 131}
146 132
147/** 133/**
@@ -155,12 +141,7 @@ static inline int atomic_inc_and_test(atomic_t *v)
155 */ 141 */
156static inline int atomic_add_negative(int i, atomic_t *v) 142static inline int atomic_add_negative(int i, atomic_t *v)
157{ 143{
158 unsigned char c; 144 GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, i, "%0", "s");
159
160 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
161 : "+m" (v->counter), "=qm" (c)
162 : "ir" (i) : "memory");
163 return c;
164} 145}
165 146
166/** 147/**
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 0e1cbfc8ee06..3f065c985aee 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -72,12 +72,7 @@ static inline void atomic64_sub(long i, atomic64_t *v)
72 */ 72 */
73static inline int atomic64_sub_and_test(long i, atomic64_t *v) 73static inline int atomic64_sub_and_test(long i, atomic64_t *v)
74{ 74{
75 unsigned char c; 75 GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, i, "%0", "e");
76
77 asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
78 : "=m" (v->counter), "=qm" (c)
79 : "er" (i), "m" (v->counter) : "memory");
80 return c;
81} 76}
82 77
83/** 78/**
@@ -116,12 +111,7 @@ static inline void atomic64_dec(atomic64_t *v)
116 */ 111 */
117static inline int atomic64_dec_and_test(atomic64_t *v) 112static inline int atomic64_dec_and_test(atomic64_t *v)
118{ 113{
119 unsigned char c; 114 GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e");
120
121 asm volatile(LOCK_PREFIX "decq %0; sete %1"
122 : "=m" (v->counter), "=qm" (c)
123 : "m" (v->counter) : "memory");
124 return c != 0;
125} 115}
126 116
127/** 117/**
@@ -134,12 +124,7 @@ static inline int atomic64_dec_and_test(atomic64_t *v)
134 */ 124 */
135static inline int atomic64_inc_and_test(atomic64_t *v) 125static inline int atomic64_inc_and_test(atomic64_t *v)
136{ 126{
137 unsigned char c; 127 GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e");
138
139 asm volatile(LOCK_PREFIX "incq %0; sete %1"
140 : "=m" (v->counter), "=qm" (c)
141 : "m" (v->counter) : "memory");
142 return c != 0;
143} 128}
144 129
145/** 130/**
@@ -153,12 +138,7 @@ static inline int atomic64_inc_and_test(atomic64_t *v)
153 */ 138 */
154static inline int atomic64_add_negative(long i, atomic64_t *v) 139static inline int atomic64_add_negative(long i, atomic64_t *v)
155{ 140{
156 unsigned char c; 141 GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, i, "%0", "s");
157
158 asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
159 : "=m" (v->counter), "=qm" (c)
160 : "er" (i), "m" (v->counter) : "memory");
161 return c;
162} 142}
163 143
164/** 144/**
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 41639ce8fd63..6d76d0935989 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -14,6 +14,7 @@
14 14
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <asm/alternative.h> 16#include <asm/alternative.h>
17#include <asm/rmwcc.h>
17 18
18#if BITS_PER_LONG == 32 19#if BITS_PER_LONG == 32
19# define _BITOPS_LONG_SHIFT 5 20# define _BITOPS_LONG_SHIFT 5
@@ -204,12 +205,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
204 */ 205 */
205static inline int test_and_set_bit(long nr, volatile unsigned long *addr) 206static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
206{ 207{
207 int oldbit; 208 GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, nr, "%0", "c");
208
209 asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
210 "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
211
212 return oldbit;
213} 209}
214 210
215/** 211/**
@@ -255,13 +251,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
255 */ 251 */
256static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) 252static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
257{ 253{
258 int oldbit; 254 GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, nr, "%0", "c");
259
260 asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
261 "sbb %0,%0"
262 : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
263
264 return oldbit;
265} 255}
266 256
267/** 257/**
@@ -314,13 +304,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
314 */ 304 */
315static inline int test_and_change_bit(long nr, volatile unsigned long *addr) 305static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
316{ 306{
317 int oldbit; 307 GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, nr, "%0", "c");
318
319 asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
320 "sbb %0,%0"
321 : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
322
323 return oldbit;
324} 308}
325 309
326static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) 310static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr)
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 0fa675033912..cb4c73bfeb48 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -48,6 +48,8 @@ For 32-bit we have the following conventions - kernel is built with
48 48
49#include <asm/dwarf2.h> 49#include <asm/dwarf2.h>
50 50
51#ifdef CONFIG_X86_64
52
51/* 53/*
52 * 64-bit system call stack frame layout defines and helpers, 54 * 64-bit system call stack frame layout defines and helpers,
53 * for assembly code: 55 * for assembly code:
@@ -192,3 +194,51 @@ For 32-bit we have the following conventions - kernel is built with
192 .macro icebp 194 .macro icebp
193 .byte 0xf1 195 .byte 0xf1
194 .endm 196 .endm
197
198#else /* CONFIG_X86_64 */
199
200/*
201 * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
202 * are different from the entry_32.S versions in not changing the segment
203 * registers. So only suitable for in kernel use, not when transitioning
204 * from or to user space. The resulting stack frame is not a standard
205 * pt_regs frame. The main use case is calling C code from assembler
206 * when all the registers need to be preserved.
207 */
208
209 .macro SAVE_ALL
210 pushl_cfi %eax
211 CFI_REL_OFFSET eax, 0
212 pushl_cfi %ebp
213 CFI_REL_OFFSET ebp, 0
214 pushl_cfi %edi
215 CFI_REL_OFFSET edi, 0
216 pushl_cfi %esi
217 CFI_REL_OFFSET esi, 0
218 pushl_cfi %edx
219 CFI_REL_OFFSET edx, 0
220 pushl_cfi %ecx
221 CFI_REL_OFFSET ecx, 0
222 pushl_cfi %ebx
223 CFI_REL_OFFSET ebx, 0
224 .endm
225
226 .macro RESTORE_ALL
227 popl_cfi %ebx
228 CFI_RESTORE ebx
229 popl_cfi %ecx
230 CFI_RESTORE ecx
231 popl_cfi %edx
232 CFI_RESTORE edx
233 popl_cfi %esi
234 CFI_RESTORE esi
235 popl_cfi %edi
236 CFI_RESTORE edi
237 popl_cfi %ebp
238 CFI_RESTORE ebp
239 popl_cfi %eax
240 CFI_RESTORE eax
241 .endm
242
243#endif /* CONFIG_X86_64 */
244
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 2d89e3980cbd..5b23e605e707 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -52,12 +52,7 @@ static inline void local_sub(long i, local_t *l)
52 */ 52 */
53static inline int local_sub_and_test(long i, local_t *l) 53static inline int local_sub_and_test(long i, local_t *l)
54{ 54{
55 unsigned char c; 55 GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, i, "%0", "e");
56
57 asm volatile(_ASM_SUB "%2,%0; sete %1"
58 : "+m" (l->a.counter), "=qm" (c)
59 : "ir" (i) : "memory");
60 return c;
61} 56}
62 57
63/** 58/**
@@ -70,12 +65,7 @@ static inline int local_sub_and_test(long i, local_t *l)
70 */ 65 */
71static inline int local_dec_and_test(local_t *l) 66static inline int local_dec_and_test(local_t *l)
72{ 67{
73 unsigned char c; 68 GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e");
74
75 asm volatile(_ASM_DEC "%0; sete %1"
76 : "+m" (l->a.counter), "=qm" (c)
77 : : "memory");
78 return c != 0;
79} 69}
80 70
81/** 71/**
@@ -88,12 +78,7 @@ static inline int local_dec_and_test(local_t *l)
88 */ 78 */
89static inline int local_inc_and_test(local_t *l) 79static inline int local_inc_and_test(local_t *l)
90{ 80{
91 unsigned char c; 81 GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e");
92
93 asm volatile(_ASM_INC "%0; sete %1"
94 : "+m" (l->a.counter), "=qm" (c)
95 : : "memory");
96 return c != 0;
97} 82}
98 83
99/** 84/**
@@ -107,12 +92,7 @@ static inline int local_inc_and_test(local_t *l)
107 */ 92 */
108static inline int local_add_negative(long i, local_t *l) 93static inline int local_add_negative(long i, local_t *l)
109{ 94{
110 unsigned char c; 95 GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, i, "%0", "s");
111
112 asm volatile(_ASM_ADD "%2,%0; sets %1"
113 : "+m" (l->a.counter), "=qm" (c)
114 : "ir" (i) : "memory");
115 return c;
116} 96}
117 97
118/** 98/**
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
new file mode 100644
index 000000000000..8729723636fd
--- /dev/null
+++ b/arch/x86/include/asm/preempt.h
@@ -0,0 +1,100 @@
1#ifndef __ASM_PREEMPT_H
2#define __ASM_PREEMPT_H
3
4#include <asm/rmwcc.h>
5#include <asm/percpu.h>
6#include <linux/thread_info.h>
7
8DECLARE_PER_CPU(int, __preempt_count);
9
10/*
11 * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
12 * that think a non-zero value indicates we cannot preempt.
13 */
14static __always_inline int preempt_count(void)
15{
16 return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
17}
18
19static __always_inline void preempt_count_set(int pc)
20{
21 __this_cpu_write_4(__preempt_count, pc);
22}
23
24/*
25 * must be macros to avoid header recursion hell
26 */
27#define task_preempt_count(p) \
28 (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED)
29
30#define init_task_preempt_count(p) do { \
31 task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
32} while (0)
33
34#define init_idle_preempt_count(p, cpu) do { \
35 task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
36 per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
37} while (0)
38
39/*
40 * We fold the NEED_RESCHED bit into the preempt count such that
41 * preempt_enable() can decrement and test for needing to reschedule with a
42 * single instruction.
43 *
44 * We invert the actual bit, so that when the decrement hits 0 we know we both
45 * need to resched (the bit is cleared) and can resched (no preempt count).
46 */
47
48static __always_inline void set_preempt_need_resched(void)
49{
50 __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
51}
52
53static __always_inline void clear_preempt_need_resched(void)
54{
55 __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
56}
57
58static __always_inline bool test_preempt_need_resched(void)
59{
60 return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
61}
62
63/*
64 * The various preempt_count add/sub methods
65 */
66
67static __always_inline void __preempt_count_add(int val)
68{
69 __this_cpu_add_4(__preempt_count, val);
70}
71
72static __always_inline void __preempt_count_sub(int val)
73{
74 __this_cpu_add_4(__preempt_count, -val);
75}
76
77static __always_inline bool __preempt_count_dec_and_test(void)
78{
79 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
80}
81
82/*
83 * Returns true when we need to resched and can (barring IRQ state).
84 */
85static __always_inline bool should_resched(void)
86{
87 return unlikely(!__this_cpu_read_4(__preempt_count));
88}
89
90#ifdef CONFIG_PREEMPT
91 extern asmlinkage void ___preempt_schedule(void);
92# define __preempt_schedule() asm ("call ___preempt_schedule")
93 extern asmlinkage void preempt_schedule(void);
94# ifdef CONFIG_CONTEXT_TRACKING
95 extern asmlinkage void ___preempt_schedule_context(void);
96# define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
97# endif
98#endif
99
100#endif /* __ASM_PREEMPT_H */
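
[Note: these helpers are what the generic preempt_disable()/preempt_enable()
macros sit on top of. include/linux/preempt.h is part of this merge (see the
diffstat) but its hunks are not reproduced in this excerpt; the consumer side
looks approximately like the following sketch:

#define preempt_disable() \
do { \
	preempt_count_inc(); \
	barrier(); \
} while (0)

#define preempt_enable() \
do { \
	barrier(); \
	if (unlikely(preempt_count_dec_and_test())) \
		__preempt_schedule(); \
} while (0)

Because NEED_RESCHED is folded into the same per-cpu word, the common
preempt_enable() path on x86 becomes a single decl of __preempt_count plus one
conditional call.]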
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
new file mode 100644
index 000000000000..735f1849795f
--- /dev/null
+++ b/arch/x86/include/asm/rmwcc.h
@@ -0,0 +1,41 @@
1#ifndef _ASM_X86_RMWcc
2#define _ASM_X86_RMWcc
3
4#ifdef CC_HAVE_ASM_GOTO
5
6#define __GEN_RMWcc(fullop, var, cc, ...) \
7do { \
8 asm volatile goto (fullop "; j" cc " %l[cc_label]" \
9 : : "m" (var), ## __VA_ARGS__ \
10 : "memory" : cc_label); \
11 return 0; \
12cc_label: \
13 return 1; \
14} while (0)
15
16#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
17 __GEN_RMWcc(op " " arg0, var, cc)
18
19#define GEN_BINARY_RMWcc(op, var, val, arg0, cc) \
20 __GEN_RMWcc(op " %1, " arg0, var, cc, "er" (val))
21
22#else /* !CC_HAVE_ASM_GOTO */
23
24#define __GEN_RMWcc(fullop, var, cc, ...) \
25do { \
26 char c; \
27 asm volatile (fullop "; set" cc " %1" \
28 : "+m" (var), "=qm" (c) \
29 : __VA_ARGS__ : "memory"); \
30 return c != 0; \
31} while (0)
32
33#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
34 __GEN_RMWcc(op " " arg0, var, cc)
35
36#define GEN_BINARY_RMWcc(op, var, val, arg0, cc) \
37 __GEN_RMWcc(op " %2, " arg0, var, cc, "er" (val))
38
39#endif /* CC_HAVE_ASM_GOTO */
40
41#endif /* _ASM_X86_RMWcc */
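
[Note: with asm goto available, a user such as the atomic_dec_and_test() shown
earlier expands, by hand and purely for illustration, to roughly:

static inline int atomic_dec_and_test(atomic_t *v)
{
	/* GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e") */
	asm volatile goto (LOCK_PREFIX "decl %0; je %l[cc_label]"
			   : /* no outputs */
			   : "m" (v->counter)
			   : "memory" : cc_label);
	return 0;
cc_label:
	return 1;
}

i.e. the condition code produced by the locked RMW instruction feeds a
conditional jump directly, instead of being materialised into a register with
setcc and tested again by the caller.]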
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 27811190cbd7..c46a46be1ec6 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -28,8 +28,7 @@ struct thread_info {
28 __u32 flags; /* low level flags */ 28 __u32 flags; /* low level flags */
29 __u32 status; /* thread synchronous flags */ 29 __u32 status; /* thread synchronous flags */
30 __u32 cpu; /* current CPU */ 30 __u32 cpu; /* current CPU */
31 int preempt_count; /* 0 => preemptable, 31 int saved_preempt_count;
32 <0 => BUG */
33 mm_segment_t addr_limit; 32 mm_segment_t addr_limit;
34 struct restart_block restart_block; 33 struct restart_block restart_block;
35 void __user *sysenter_return; 34 void __user *sysenter_return;
@@ -49,7 +48,7 @@ struct thread_info {
49 .exec_domain = &default_exec_domain, \ 48 .exec_domain = &default_exec_domain, \
50 .flags = 0, \ 49 .flags = 0, \
51 .cpu = 0, \ 50 .cpu = 0, \
52 .preempt_count = INIT_PREEMPT_COUNT, \ 51 .saved_preempt_count = INIT_PREEMPT_COUNT, \
53 .addr_limit = KERNEL_DS, \ 52 .addr_limit = KERNEL_DS, \
54 .restart_block = { \ 53 .restart_block = { \
55 .fn = do_no_restart_syscall, \ 54 .fn = do_no_restart_syscall, \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a5408b965c9d..9b0a34e2cd79 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -36,6 +36,8 @@ obj-y += tsc.o io_delay.o rtc.o
36obj-y += pci-iommu_table.o 36obj-y += pci-iommu_table.o
37obj-y += resource.o 37obj-y += resource.o
38 38
39obj-$(CONFIG_PREEMPT) += preempt.o
40
39obj-y += process.o 41obj-y += process.o
40obj-y += i387.o xsave.o 42obj-y += i387.o xsave.o
41obj-y += ptrace.o 43obj-y += ptrace.o
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 28610822fb3c..9f6b9341950f 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -32,7 +32,6 @@ void common(void) {
32 OFFSET(TI_flags, thread_info, flags); 32 OFFSET(TI_flags, thread_info, flags);
33 OFFSET(TI_status, thread_info, status); 33 OFFSET(TI_status, thread_info, status);
34 OFFSET(TI_addr_limit, thread_info, addr_limit); 34 OFFSET(TI_addr_limit, thread_info, addr_limit);
35 OFFSET(TI_preempt_count, thread_info, preempt_count);
36 35
37 BLANK(); 36 BLANK();
38 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 37 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2793d1f095a2..5223fe6dec7b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1095,6 +1095,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
1095 1095
1096DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; 1096DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
1097 1097
1098DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
1099EXPORT_PER_CPU_SYMBOL(__preempt_count);
1100
1098DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); 1101DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1099 1102
1100/* 1103/*
@@ -1169,6 +1172,8 @@ void debug_stack_reset(void)
1169 1172
1170DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 1173DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1171EXPORT_PER_CPU_SYMBOL(current_task); 1174EXPORT_PER_CPU_SYMBOL(current_task);
1175DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
1176EXPORT_PER_CPU_SYMBOL(__preempt_count);
1172DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); 1177DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1173 1178
1174#ifdef CONFIG_CC_STACKPROTECTOR 1179#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f0dcb0ceb6a2..fd1bc1b15e6d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -362,12 +362,9 @@ END(ret_from_exception)
362#ifdef CONFIG_PREEMPT 362#ifdef CONFIG_PREEMPT
363ENTRY(resume_kernel) 363ENTRY(resume_kernel)
364 DISABLE_INTERRUPTS(CLBR_ANY) 364 DISABLE_INTERRUPTS(CLBR_ANY)
365 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
366 jnz restore_all
367need_resched: 365need_resched:
368 movl TI_flags(%ebp), %ecx # need_resched set ? 366 cmpl $0,PER_CPU_VAR(__preempt_count)
369 testb $_TIF_NEED_RESCHED, %cl 367 jnz restore_all
370 jz restore_all
371 testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? 368 testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
372 jz restore_all 369 jz restore_all
373 call preempt_schedule_irq 370 call preempt_schedule_irq
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b077f4cc225a..1a2cc64abcd7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1103,10 +1103,8 @@ retint_signal:
1103 /* Returning to kernel space. Check if we need preemption */ 1103 /* Returning to kernel space. Check if we need preemption */
1104 /* rcx: threadinfo. interrupts off. */ 1104 /* rcx: threadinfo. interrupts off. */
1105ENTRY(retint_kernel) 1105ENTRY(retint_kernel)
1106 cmpl $0,TI_preempt_count(%rcx) 1106 cmpl $0,PER_CPU_VAR(__preempt_count)
1107 jnz retint_restore_args 1107 jnz retint_restore_args
1108 bt $TIF_NEED_RESCHED,TI_flags(%rcx)
1109 jnc retint_restore_args
1110 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ 1108 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
1111 jnc retint_restore_args 1109 jnc retint_restore_args
1112 call preempt_schedule_irq 1110 call preempt_schedule_irq
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 0fa69127209a..05fd74f537d6 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -37,3 +37,10 @@ EXPORT_SYMBOL(strstr);
37 37
38EXPORT_SYMBOL(csum_partial); 38EXPORT_SYMBOL(csum_partial);
39EXPORT_SYMBOL(empty_zero_page); 39EXPORT_SYMBOL(empty_zero_page);
40
41#ifdef CONFIG_PREEMPT
42EXPORT_SYMBOL(___preempt_schedule);
43#ifdef CONFIG_CONTEXT_TRACKING
44EXPORT_SYMBOL(___preempt_schedule_context);
45#endif
46#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 4186755f1d7c..3fe066359ac0 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -100,9 +100,6 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
100 irqctx->tinfo.task = curctx->tinfo.task; 100 irqctx->tinfo.task = curctx->tinfo.task;
101 irqctx->tinfo.previous_esp = current_stack_pointer; 101 irqctx->tinfo.previous_esp = current_stack_pointer;
102 102
103 /* Copy the preempt_count so that the [soft]irq checks work. */
104 irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
105
106 if (unlikely(overflow)) 103 if (unlikely(overflow))
107 call_on_stack(print_stack_overflow, isp); 104 call_on_stack(print_stack_overflow, isp);
108 105
@@ -131,7 +128,6 @@ void irq_ctx_init(int cpu)
131 THREAD_SIZE_ORDER)); 128 THREAD_SIZE_ORDER));
132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); 129 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
133 irqctx->tinfo.cpu = cpu; 130 irqctx->tinfo.cpu = cpu;
134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
135 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 131 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
136 132
137 per_cpu(hardirq_ctx, cpu) = irqctx; 133 per_cpu(hardirq_ctx, cpu) = irqctx;
diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
new file mode 100644
index 000000000000..ca7f0d58a87d
--- /dev/null
+++ b/arch/x86/kernel/preempt.S
@@ -0,0 +1,25 @@
1
2#include <linux/linkage.h>
3#include <asm/dwarf2.h>
4#include <asm/asm.h>
5#include <asm/calling.h>
6
7ENTRY(___preempt_schedule)
8 CFI_STARTPROC
9 SAVE_ALL
10 call preempt_schedule
11 RESTORE_ALL
12 ret
13 CFI_ENDPROC
14
15#ifdef CONFIG_CONTEXT_TRACKING
16
17ENTRY(___preempt_schedule_context)
18 CFI_STARTPROC
19 SAVE_ALL
20 call preempt_schedule_context
21 RESTORE_ALL
22 ret
23 CFI_ENDPROC
24
25#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c83516be1052..3fb8d95ab8b5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -391,9 +391,9 @@ static void amd_e400_idle(void)
391 * The switch back from broadcast mode needs to be 391 * The switch back from broadcast mode needs to be
392 * called with interrupts disabled. 392 * called with interrupts disabled.
393 */ 393 */
394 local_irq_disable(); 394 local_irq_disable();
395 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); 395 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
396 local_irq_enable(); 396 local_irq_enable();
397 } else 397 } else
398 default_idle(); 398 default_idle();
399} 399}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 884f98f69354..c2ec1aa6d454 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -292,6 +292,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
292 set_iopl_mask(next->iopl); 292 set_iopl_mask(next->iopl);
293 293
294 /* 294 /*
295 * If it were not for PREEMPT_ACTIVE we could guarantee that the
296 * preempt_count of all tasks was equal here and this would not be
297 * needed.
298 */
299 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
300 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
301
302 /*
295 * Now maybe handle debug registers and/or IO bitmaps 303 * Now maybe handle debug registers and/or IO bitmaps
296 */ 304 */
297 if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || 305 if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bb1dc51bab05..45ab4d6fc8a7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
363 this_cpu_write(old_rsp, next->usersp); 363 this_cpu_write(old_rsp, next->usersp);
364 this_cpu_write(current_task, next_p); 364 this_cpu_write(current_task, next_p);
365 365
366 /*
367 * If it were not for PREEMPT_ACTIVE we could guarantee that the
368 * preempt_count of all tasks was equal here and this would not be
369 * needed.
370 */
371 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
372 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
373
366 this_cpu_write(kernel_stack, 374 this_cpu_write(kernel_stack,
367 (unsigned long)task_stack_page(next_p) + 375 (unsigned long)task_stack_page(next_p) +
368 THREAD_SIZE - KERNEL_STACK_OFFSET); 376 THREAD_SIZE - KERNEL_STACK_OFFSET);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8c8093b146ca..729aa779ff75 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -88,7 +88,7 @@ static inline void conditional_sti(struct pt_regs *regs)
88 88
89static inline void preempt_conditional_sti(struct pt_regs *regs) 89static inline void preempt_conditional_sti(struct pt_regs *regs)
90{ 90{
91 inc_preempt_count(); 91 preempt_count_inc();
92 if (regs->flags & X86_EFLAGS_IF) 92 if (regs->flags & X86_EFLAGS_IF)
93 local_irq_enable(); 93 local_irq_enable();
94} 94}
@@ -103,7 +103,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
103{ 103{
104 if (regs->flags & X86_EFLAGS_IF) 104 if (regs->flags & X86_EFLAGS_IF)
105 local_irq_disable(); 105 local_irq_disable();
106 dec_preempt_count(); 106 preempt_count_dec();
107} 107}
108 108
109static int __kprobes 109static int __kprobes
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index b014d9414d08..040681928e9d 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -66,3 +66,10 @@ EXPORT_SYMBOL(empty_zero_page);
66#ifndef CONFIG_PARAVIRT 66#ifndef CONFIG_PARAVIRT
67EXPORT_SYMBOL(native_load_gs_index); 67EXPORT_SYMBOL(native_load_gs_index);
68#endif 68#endif
69
70#ifdef CONFIG_PREEMPT
71EXPORT_SYMBOL(___preempt_schedule);
72#ifdef CONFIG_CONTEXT_TRACKING
73EXPORT_SYMBOL(___preempt_schedule_context);
74#endif
75#endif
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 1b982641ec35..228d6aee3a16 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -28,3 +28,4 @@ generic-y += termios.h
28generic-y += topology.h 28generic-y += topology.h
29generic-y += trace_clock.h 29generic-y += trace_clock.h
30generic-y += xor.h 30generic-y += xor.h
31generic-y += preempt.h
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index f98dd00b51a9..c7414a545a4f 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -119,17 +119,10 @@ static struct dmi_system_id processor_power_dmi_table[] = {
119 */ 119 */
120static void acpi_safe_halt(void) 120static void acpi_safe_halt(void)
121{ 121{
122 current_thread_info()->status &= ~TS_POLLING; 122 if (!tif_need_resched()) {
123 /*
124 * TS_POLLING-cleared state must be visible before we
125 * test NEED_RESCHED:
126 */
127 smp_mb();
128 if (!need_resched()) {
129 safe_halt(); 123 safe_halt();
130 local_irq_disable(); 124 local_irq_disable();
131 } 125 }
132 current_thread_info()->status |= TS_POLLING;
133} 126}
134 127
135#ifdef ARCH_APICTIMER_STOPS_ON_C3 128#ifdef ARCH_APICTIMER_STOPS_ON_C3
@@ -737,6 +730,11 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
737 if (unlikely(!pr)) 730 if (unlikely(!pr))
738 return -EINVAL; 731 return -EINVAL;
739 732
733 if (cx->entry_method == ACPI_CSTATE_FFH) {
734 if (current_set_polling_and_test())
735 return -EINVAL;
736 }
737
740 lapic_timer_state_broadcast(pr, cx, 1); 738 lapic_timer_state_broadcast(pr, cx, 1);
741 acpi_idle_do_entry(cx); 739 acpi_idle_do_entry(cx);
742 740
@@ -790,18 +788,9 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
790 if (unlikely(!pr)) 788 if (unlikely(!pr))
791 return -EINVAL; 789 return -EINVAL;
792 790
793 if (cx->entry_method != ACPI_CSTATE_FFH) { 791 if (cx->entry_method == ACPI_CSTATE_FFH) {
794 current_thread_info()->status &= ~TS_POLLING; 792 if (current_set_polling_and_test())
795 /*
796 * TS_POLLING-cleared state must be visible before we test
797 * NEED_RESCHED:
798 */
799 smp_mb();
800
801 if (unlikely(need_resched())) {
802 current_thread_info()->status |= TS_POLLING;
803 return -EINVAL; 793 return -EINVAL;
804 }
805 } 794 }
806 795
807 /* 796 /*
@@ -819,9 +808,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
819 808
820 sched_clock_idle_wakeup_event(0); 809 sched_clock_idle_wakeup_event(0);
821 810
822 if (cx->entry_method != ACPI_CSTATE_FFH)
823 current_thread_info()->status |= TS_POLLING;
824
825 lapic_timer_state_broadcast(pr, cx, 0); 811 lapic_timer_state_broadcast(pr, cx, 0);
826 return index; 812 return index;
827} 813}
@@ -858,18 +844,9 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
858 } 844 }
859 } 845 }
860 846
861 if (cx->entry_method != ACPI_CSTATE_FFH) { 847 if (cx->entry_method == ACPI_CSTATE_FFH) {
862 current_thread_info()->status &= ~TS_POLLING; 848 if (current_set_polling_and_test())
863 /*
864 * TS_POLLING-cleared state must be visible before we test
865 * NEED_RESCHED:
866 */
867 smp_mb();
868
869 if (unlikely(need_resched())) {
870 current_thread_info()->status |= TS_POLLING;
871 return -EINVAL; 849 return -EINVAL;
872 }
873 } 850 }
874 851
875 acpi_unlazy_tlb(smp_processor_id()); 852 acpi_unlazy_tlb(smp_processor_id());
@@ -915,9 +892,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
915 892
916 sched_clock_idle_wakeup_event(0); 893 sched_clock_idle_wakeup_event(0);
917 894
918 if (cx->entry_method != ACPI_CSTATE_FFH)
919 current_thread_info()->status |= TS_POLLING;
920
921 lapic_timer_state_broadcast(pr, cx, 0); 895 lapic_timer_state_broadcast(pr, cx, 0);
922 return index; 896 return index;
923} 897}
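
[Note: current_set_polling_and_test() replaces the open-coded TS_POLLING
manipulation removed above. Its definition lives in include/linux/sched.h, which
is part of this merge but not shown in this excerpt; approximately:

static inline bool __must_check current_set_polling_and_test(void)
{
	__current_set_polling();	/* advertise that this CPU is polling */

	/*
	 * The polling state must be visible before NEED_RESCHED is tested,
	 * paired with the barrier in resched_task().
	 */
	smp_mb();

	return unlikely(tif_need_resched());
}

so each idle driver gets the set-flag / barrier / test sequence from one helper
and bails out of the idle entry when a reschedule is already pending.]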
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index fa6964d8681a..f116d664b473 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -359,7 +359,7 @@ static int intel_idle(struct cpuidle_device *dev,
359 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 359 if (!(lapic_timer_reliable_states & (1 << (cstate))))
360 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 360 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
361 361
362 if (!need_resched()) { 362 if (!current_set_polling_and_test()) {
363 363
364 __monitor((void *)&current_thread_info()->flags, 0, 0); 364 __monitor((void *)&current_thread_info()->flags, 0, 0);
365 smp_mb(); 365 smp_mb();
diff --git a/fs/exec.c b/fs/exec.c
index 8875dd10ae7a..2ea437e5acf4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1547,6 +1547,7 @@ static int do_execve_common(const char *filename,
1547 current->fs->in_exec = 0; 1547 current->fs->in_exec = 0;
1548 current->in_execve = 0; 1548 current->in_execve = 0;
1549 acct_update_integrals(current); 1549 acct_update_integrals(current);
1550 task_numa_free(current);
1550 free_bprm(bprm); 1551 free_bprm(bprm);
1551 if (displaced) 1552 if (displaced)
1552 put_files_struct(displaced); 1553 put_files_struct(displaced);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index cbd0f1b324b9..1bd2077187fd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -183,6 +183,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
183 seq_printf(m, 183 seq_printf(m,
184 "State:\t%s\n" 184 "State:\t%s\n"
185 "Tgid:\t%d\n" 185 "Tgid:\t%d\n"
186 "Ngid:\t%d\n"
186 "Pid:\t%d\n" 187 "Pid:\t%d\n"
187 "PPid:\t%d\n" 188 "PPid:\t%d\n"
188 "TracerPid:\t%d\n" 189 "TracerPid:\t%d\n"
@@ -190,6 +191,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
190 "Gid:\t%d\t%d\t%d\t%d\n", 191 "Gid:\t%d\t%d\t%d\t%d\n",
191 get_task_state(p), 192 get_task_state(p),
192 task_tgid_nr_ns(p, ns), 193 task_tgid_nr_ns(p, ns),
194 task_numa_group_id(p),
193 pid_nr_ns(pid, ns), 195 pid_nr_ns(pid, ns),
194 ppid, tpid, 196 ppid, tpid,
195 from_kuid_munged(user_ns, cred->uid), 197 from_kuid_munged(user_ns, cred->uid),
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
new file mode 100644
index 000000000000..ddf2b420ac8f
--- /dev/null
+++ b/include/asm-generic/preempt.h
@@ -0,0 +1,105 @@
1#ifndef __ASM_PREEMPT_H
2#define __ASM_PREEMPT_H
3
4#include <linux/thread_info.h>
5
6/*
7 * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
8 * that think a non-zero value indicates we cannot preempt.
9 */
10static __always_inline int preempt_count(void)
11{
12 return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED;
13}
14
15static __always_inline int *preempt_count_ptr(void)
16{
17 return &current_thread_info()->preempt_count;
18}
19
20/*
21 * We now lose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the
22 * alternative is losing a reschedule. Better to schedule too often -- also this
23 * should be a very rare operation.
24 */
25static __always_inline void preempt_count_set(int pc)
26{
27 *preempt_count_ptr() = pc;
28}
29
30/*
31 * must be macros to avoid header recursion hell
32 */
33#define task_preempt_count(p) \
34 (task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED)
35
36#define init_task_preempt_count(p) do { \
37 task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
38} while (0)
39
40#define init_idle_preempt_count(p, cpu) do { \
41 task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
42} while (0)
43
44/*
45 * We fold the NEED_RESCHED bit into the preempt count such that
46 * preempt_enable() can decrement and test for needing to reschedule with a
47 * single instruction.
48 *
49 * We invert the actual bit, so that when the decrement hits 0 we know we both
50 * need to resched (the bit is cleared) and can resched (no preempt count).
51 */
52
53static __always_inline void set_preempt_need_resched(void)
54{
55 *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED;
56}
57
58static __always_inline void clear_preempt_need_resched(void)
59{
60 *preempt_count_ptr() |= PREEMPT_NEED_RESCHED;
61}
62
63static __always_inline bool test_preempt_need_resched(void)
64{
65 return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED);
66}
67
68/*
69 * The various preempt_count add/sub methods
70 */
71
72static __always_inline void __preempt_count_add(int val)
73{
74 *preempt_count_ptr() += val;
75}
76
77static __always_inline void __preempt_count_sub(int val)
78{
79 *preempt_count_ptr() -= val;
80}
81
82static __always_inline bool __preempt_count_dec_and_test(void)
83{
84 return !--*preempt_count_ptr();
85}
86
87/*
88 * Returns true when we need to resched and can (barring IRQ state).
89 */
90static __always_inline bool should_resched(void)
91{
92 return unlikely(!*preempt_count_ptr());
93}
94
95#ifdef CONFIG_PREEMPT
96extern asmlinkage void preempt_schedule(void);
97#define __preempt_schedule() preempt_schedule()
98
99#ifdef CONFIG_CONTEXT_TRACKING
100extern asmlinkage void preempt_schedule_context(void);
101#define __preempt_schedule_context() preempt_schedule_context()
102#endif
103#endif /* CONFIG_PREEMPT */
104
105#endif /* __ASM_PREEMPT_H */
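The comment block above describes folding an inverted NEED_RESCHED bit into the preempt count. A stand-alone model of that trick; PREEMPT_NEED_RESCHED mirrors the value added to linux/preempt.h further down, while the plain counter and helper names are user-space stand-ins:

#include <stdbool.h>
#include <stdio.h>

#define PREEMPT_NEED_RESCHED	0x80000000u

static unsigned int pc = PREEMPT_NEED_RESCHED;	/* PREEMPT_ENABLED: count 0, no resched wanted */

static void set_need_resched(void)	{ pc &= ~PREEMPT_NEED_RESCHED; }	/* bit is inverted */
static void clear_need_resched(void)	{ pc |=  PREEMPT_NEED_RESCHED; }
static bool dec_and_test(void)		{ return --pc == 0; }

int main(void)
{
	pc += 1;			/* preempt_disable() */
	set_need_resched();		/* a wakeup marks a pending reschedule */
	if (dec_and_test())		/* preempt_enable(): a single dec-and-test */
		printf("count hit zero with resched pending -> schedule\n");
	clear_need_resched();
	return 0;
}

Because the bit stays set while no reschedule is wanted, the decrement can only reach zero when the count is gone and NEED_RESCHED is pending, which is exactly the condition preempt_enable() has to test.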
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 1e041063b226..d9cf963ac832 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -33,7 +33,7 @@ extern void rcu_nmi_exit(void);
33#define __irq_enter() \ 33#define __irq_enter() \
34 do { \ 34 do { \
35 account_irq_enter_time(current); \ 35 account_irq_enter_time(current); \
36 add_preempt_count(HARDIRQ_OFFSET); \ 36 preempt_count_add(HARDIRQ_OFFSET); \
37 trace_hardirq_enter(); \ 37 trace_hardirq_enter(); \
38 } while (0) 38 } while (0)
39 39
@@ -49,7 +49,7 @@ extern void irq_enter(void);
49 do { \ 49 do { \
50 trace_hardirq_exit(); \ 50 trace_hardirq_exit(); \
51 account_irq_exit_time(current); \ 51 account_irq_exit_time(current); \
52 sub_preempt_count(HARDIRQ_OFFSET); \ 52 preempt_count_sub(HARDIRQ_OFFSET); \
53 } while (0) 53 } while (0)
54 54
55/* 55/*
@@ -62,7 +62,7 @@ extern void irq_exit(void);
62 lockdep_off(); \ 62 lockdep_off(); \
63 ftrace_nmi_enter(); \ 63 ftrace_nmi_enter(); \
64 BUG_ON(in_nmi()); \ 64 BUG_ON(in_nmi()); \
65 add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ 65 preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
66 rcu_nmi_enter(); \ 66 rcu_nmi_enter(); \
67 trace_hardirq_enter(); \ 67 trace_hardirq_enter(); \
68 } while (0) 68 } while (0)
@@ -72,7 +72,7 @@ extern void irq_exit(void);
72 trace_hardirq_exit(); \ 72 trace_hardirq_exit(); \
73 rcu_nmi_exit(); \ 73 rcu_nmi_exit(); \
74 BUG_ON(!in_nmi()); \ 74 BUG_ON(!in_nmi()); \
75 sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ 75 preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
76 ftrace_nmi_exit(); \ 76 ftrace_nmi_exit(); \
77 lockdep_on(); \ 77 lockdep_on(); \
78 } while (0) 78 } while (0)
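The hardirq.h hunk above is a plain rename (add/sub_preempt_count becomes preempt_count_add/sub), but the fixed offsets being added are worth a note: each context class owns its own bit field inside the one preempt counter, so entering hard-IRQ or NMI context is just an addition and the in_irq()/in_nmi() style tests are masks. A toy with made-up field positions, not the real preempt_mask.h values:

#include <stdio.h>

#define HARDIRQ_OFFSET	(1u << 8)	/* illustrative layout only */
#define NMI_OFFSET	(1u << 16)

#define HARDIRQ_MASK	(0xffu << 8)
#define NMI_MASK	(0xffu << 16)

static unsigned int count;

int main(void)
{
	count += HARDIRQ_OFFSET;		/* __irq_enter() */
	count += NMI_OFFSET + HARDIRQ_OFFSET;	/* nmi_enter() nested on top */
	printf("in_irq=%u in_nmi=%u\n", !!(count & HARDIRQ_MASK), !!(count & NMI_MASK));

	count -= NMI_OFFSET + HARDIRQ_OFFSET;	/* nmi_exit() */
	count -= HARDIRQ_OFFSET;		/* __irq_exit() */
	printf("in_irq=%u in_nmi=%u\n", !!(count & HARDIRQ_MASK), !!(count & NMI_MASK));
	return 0;
}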
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index da6716b9e3fe..ea4d2495c646 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -136,6 +136,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
136 136
137struct mempolicy *get_vma_policy(struct task_struct *tsk, 137struct mempolicy *get_vma_policy(struct task_struct *tsk,
138 struct vm_area_struct *vma, unsigned long addr); 138 struct vm_area_struct *vma, unsigned long addr);
139bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma);
139 140
140extern void numa_default_policy(void); 141extern void numa_default_policy(void);
141extern void numa_policy_init(void); 142extern void numa_policy_init(void);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 8d3c57fdf221..f5096b58b20d 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -90,11 +90,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
90#endif /* CONFIG_MIGRATION */ 90#endif /* CONFIG_MIGRATION */
91 91
92#ifdef CONFIG_NUMA_BALANCING 92#ifdef CONFIG_NUMA_BALANCING
93extern int migrate_misplaced_page(struct page *page, int node); 93extern int migrate_misplaced_page(struct page *page,
94extern int migrate_misplaced_page(struct page *page, int node); 94 struct vm_area_struct *vma, int node);
95extern bool migrate_ratelimited(int node); 95extern bool migrate_ratelimited(int node);
96#else 96#else
97static inline int migrate_misplaced_page(struct page *page, int node) 97static inline int migrate_misplaced_page(struct page *page,
98 struct vm_area_struct *vma, int node)
98{ 99{
99 return -EAGAIN; /* can't migrate now */ 100 return -EAGAIN; /* can't migrate now */
100} 101}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b6e55ee8855..81443d557a2e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
581 * sets it, so none of the operations on it need to be atomic. 581 * sets it, so none of the operations on it need to be atomic.
582 */ 582 */
583 583
584/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ 584/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
585#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) 585#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
586#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) 586#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
587#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) 587#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
588#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) 588#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
589 589
590/* 590/*
591 * Define the bit shifts to access each section. For non-existent 591 * Define the bit shifts to access each section. For non-existent
@@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
595#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) 595#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
596#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) 596#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
597#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) 597#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
598#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) 598#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
599 599
600/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ 600/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
601#ifdef NODE_NOT_IN_PAGE_FLAGS 601#ifdef NODE_NOT_IN_PAGE_FLAGS
@@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
617#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) 617#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
618#define NODES_MASK ((1UL << NODES_WIDTH) - 1) 618#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
619#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) 619#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
620#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) 620#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_WIDTH) - 1)
621#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) 621#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
622 622
623static inline enum zone_type page_zonenum(const struct page *page) 623static inline enum zone_type page_zonenum(const struct page *page)
@@ -661,51 +661,117 @@ static inline int page_to_nid(const struct page *page)
661#endif 661#endif
662 662
663#ifdef CONFIG_NUMA_BALANCING 663#ifdef CONFIG_NUMA_BALANCING
664#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 664static inline int cpu_pid_to_cpupid(int cpu, int pid)
665static inline int page_nid_xchg_last(struct page *page, int nid)
666{ 665{
667 return xchg(&page->_last_nid, nid); 666 return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
668} 667}
669 668
670static inline int page_nid_last(struct page *page) 669static inline int cpupid_to_pid(int cpupid)
671{ 670{
672 return page->_last_nid; 671 return cpupid & LAST__PID_MASK;
673} 672}
674static inline void page_nid_reset_last(struct page *page) 673
674static inline int cpupid_to_cpu(int cpupid)
675{ 675{
676 page->_last_nid = -1; 676 return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
677} 677}
678#else 678
679static inline int page_nid_last(struct page *page) 679static inline int cpupid_to_nid(int cpupid)
680{ 680{
681 return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; 681 return cpu_to_node(cpupid_to_cpu(cpupid));
682} 682}
683 683
684extern int page_nid_xchg_last(struct page *page, int nid); 684static inline bool cpupid_pid_unset(int cpupid)
685{
686 return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
687}
685 688
686static inline void page_nid_reset_last(struct page *page) 689static inline bool cpupid_cpu_unset(int cpupid)
687{ 690{
688 int nid = (1 << LAST_NID_SHIFT) - 1; 691 return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
692}
689 693
690 page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); 694static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
691 page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; 695{
696 return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
697}
698
699#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
700#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
701static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
702{
703 return xchg(&page->_last_cpupid, cpupid);
704}
705
706static inline int page_cpupid_last(struct page *page)
707{
708 return page->_last_cpupid;
709}
710static inline void page_cpupid_reset_last(struct page *page)
711{
712 page->_last_cpupid = -1;
692} 713}
693#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */
694#else 714#else
695static inline int page_nid_xchg_last(struct page *page, int nid) 715static inline int page_cpupid_last(struct page *page)
696{ 716{
697 return page_to_nid(page); 717 return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
698} 718}
699 719
700static inline int page_nid_last(struct page *page) 720extern int page_cpupid_xchg_last(struct page *page, int cpupid);
721
722static inline void page_cpupid_reset_last(struct page *page)
701{ 723{
702 return page_to_nid(page); 724 int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
725
726 page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
727 page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
728}
729#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
730#else /* !CONFIG_NUMA_BALANCING */
731static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
732{
733 return page_to_nid(page); /* XXX */
703} 734}
704 735
705static inline void page_nid_reset_last(struct page *page) 736static inline int page_cpupid_last(struct page *page)
706{ 737{
738 return page_to_nid(page); /* XXX */
707} 739}
708#endif 740
741static inline int cpupid_to_nid(int cpupid)
742{
743 return -1;
744}
745
746static inline int cpupid_to_pid(int cpupid)
747{
748 return -1;
749}
750
751static inline int cpupid_to_cpu(int cpupid)
752{
753 return -1;
754}
755
756static inline int cpu_pid_to_cpupid(int nid, int pid)
757{
758 return -1;
759}
760
761static inline bool cpupid_pid_unset(int cpupid)
762{
763 return 1;
764}
765
766static inline void page_cpupid_reset_last(struct page *page)
767{
768}
769
770static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
771{
772 return false;
773}
774#endif /* CONFIG_NUMA_BALANCING */
709 775
710static inline struct zone *page_zone(const struct page *page) 776static inline struct zone *page_zone(const struct page *page)
711{ 777{
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d9851eeb6e1d..a3198e5aaf4e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -174,8 +174,8 @@ struct page {
174 void *shadow; 174 void *shadow;
175#endif 175#endif
176 176
177#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 177#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
178 int _last_nid; 178 int _last_cpupid;
179#endif 179#endif
180} 180}
181/* 181/*
@@ -420,28 +420,15 @@ struct mm_struct {
420 */ 420 */
421 unsigned long numa_next_scan; 421 unsigned long numa_next_scan;
422 422
423 /* numa_next_reset is when the PTE scanner period will be reset */
424 unsigned long numa_next_reset;
425
426 /* Restart point for scanning and setting pte_numa */ 423 /* Restart point for scanning and setting pte_numa */
427 unsigned long numa_scan_offset; 424 unsigned long numa_scan_offset;
428 425
429 /* numa_scan_seq prevents two threads setting pte_numa */ 426 /* numa_scan_seq prevents two threads setting pte_numa */
430 int numa_scan_seq; 427 int numa_scan_seq;
431
432 /*
433 * The first node a task was scheduled on. If a task runs on
434 * a different node than Make PTE Scan Go Now.
435 */
436 int first_nid;
437#endif 428#endif
438 struct uprobes_state uprobes_state; 429 struct uprobes_state uprobes_state;
439}; 430};
440 431
441/* first nid will either be a valid NID or one of these values */
442#define NUMA_PTE_SCAN_INIT -1
443#define NUMA_PTE_SCAN_ACTIVE -2
444
445static inline void mm_init_cpumask(struct mm_struct *mm) 432static inline void mm_init_cpumask(struct mm_struct *mm)
446{ 433{
447#ifdef CONFIG_CPUMASK_OFFSTACK 434#ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 93506a114034..da523661500a 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -38,10 +38,10 @@
38 * The last is when there is insufficient space in page->flags and a separate 38 * The last is when there is insufficient space in page->flags and a separate
39 * lookup is necessary. 39 * lookup is necessary.
40 * 40 *
41 * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | 41 * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
42 * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS | 42 * " plus space for last_cpupid: | NODE | ZONE | LAST_CPUPID ... | FLAGS |
43 * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | 43 * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
44 * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS | 44 * " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS |
45 * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | 45 * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS |
46 */ 46 */
47#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 47#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
@@ -62,15 +62,21 @@
62#endif 62#endif
63 63
64#ifdef CONFIG_NUMA_BALANCING 64#ifdef CONFIG_NUMA_BALANCING
65#define LAST_NID_SHIFT NODES_SHIFT 65#define LAST__PID_SHIFT 8
66#define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1)
67
68#define LAST__CPU_SHIFT NR_CPUS_BITS
69#define LAST__CPU_MASK ((1 << LAST__CPU_SHIFT)-1)
70
71#define LAST_CPUPID_SHIFT (LAST__PID_SHIFT+LAST__CPU_SHIFT)
66#else 72#else
67#define LAST_NID_SHIFT 0 73#define LAST_CPUPID_SHIFT 0
68#endif 74#endif
69 75
70#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS 76#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
71#define LAST_NID_WIDTH LAST_NID_SHIFT 77#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
72#else 78#else
73#define LAST_NID_WIDTH 0 79#define LAST_CPUPID_WIDTH 0
74#endif 80#endif
75 81
76/* 82/*
@@ -81,8 +87,8 @@
81#define NODE_NOT_IN_PAGE_FLAGS 87#define NODE_NOT_IN_PAGE_FLAGS
82#endif 88#endif
83 89
84#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0 90#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0
85#define LAST_NID_NOT_IN_PAGE_FLAGS 91#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
86#endif 92#endif
87 93
88#endif /* _LINUX_PAGE_FLAGS_LAYOUT */ 94#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
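Whether the combined cpupid field still fits next to the section/node/zone fields in page->flags is decided by the #if above; when it does not, LAST_CPUPID_NOT_IN_PAGE_FLAGS falls back to the separate page->_last_cpupid word seen in the mm_types.h hunk. A quick arithmetic sketch with example widths; none of these numbers come from a real configuration:

#include <stdio.h>

int main(void)
{
	int bits_per_long = 64, nr_pageflags = 22;	/* example values */
	int sections = 0, zones = 2, nodes = 10;	/* example field widths */
	int last_cpupid = 8 + 10;			/* LAST__PID_SHIFT + an assumed NR_CPUS_BITS */

	int needed = sections + zones + nodes + last_cpupid;
	int avail  = bits_per_long - nr_pageflags;

	printf("%d bits needed, %d available: %s\n", needed, avail,
	       needed <= avail ? "cpupid lives in page->flags"
			       : "fall back to page->_last_cpupid");
	return 0;
}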
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index f5d4723cdb3d..a3d9dc8c2c00 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -6,106 +6,95 @@
6 * preempt_count (used for kernel preemption, interrupt count, etc.) 6 * preempt_count (used for kernel preemption, interrupt count, etc.)
7 */ 7 */
8 8
9#include <linux/thread_info.h>
10#include <linux/linkage.h> 9#include <linux/linkage.h>
11#include <linux/list.h> 10#include <linux/list.h>
12 11
13#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) 12/*
14 extern void add_preempt_count(int val); 13 * We use the MSB mostly because it's available; see <linux/preempt_mask.h> for
15 extern void sub_preempt_count(int val); 14 * the other bits -- can't include that header due to inclusion hell.
16#else 15 */
17# define add_preempt_count(val) do { preempt_count() += (val); } while (0) 16#define PREEMPT_NEED_RESCHED 0x80000000
18# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0)
19#endif
20
21#define inc_preempt_count() add_preempt_count(1)
22#define dec_preempt_count() sub_preempt_count(1)
23
24#define preempt_count() (current_thread_info()->preempt_count)
25
26#ifdef CONFIG_PREEMPT
27
28asmlinkage void preempt_schedule(void);
29
30#define preempt_check_resched() \
31do { \
32 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
33 preempt_schedule(); \
34} while (0)
35
36#ifdef CONFIG_CONTEXT_TRACKING
37 17
38void preempt_schedule_context(void); 18#include <asm/preempt.h>
39 19
40#define preempt_check_resched_context() \ 20#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
41do { \ 21extern void preempt_count_add(int val);
42 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ 22extern void preempt_count_sub(int val);
43 preempt_schedule_context(); \ 23#define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); })
44} while (0)
45#else 24#else
25#define preempt_count_add(val) __preempt_count_add(val)
26#define preempt_count_sub(val) __preempt_count_sub(val)
27#define preempt_count_dec_and_test() __preempt_count_dec_and_test()
28#endif
46 29
47#define preempt_check_resched_context() preempt_check_resched() 30#define __preempt_count_inc() __preempt_count_add(1)
48 31#define __preempt_count_dec() __preempt_count_sub(1)
49#endif /* CONFIG_CONTEXT_TRACKING */
50
51#else /* !CONFIG_PREEMPT */
52
53#define preempt_check_resched() do { } while (0)
54#define preempt_check_resched_context() do { } while (0)
55
56#endif /* CONFIG_PREEMPT */
57 32
33#define preempt_count_inc() preempt_count_add(1)
34#define preempt_count_dec() preempt_count_sub(1)
58 35
59#ifdef CONFIG_PREEMPT_COUNT 36#ifdef CONFIG_PREEMPT_COUNT
60 37
61#define preempt_disable() \ 38#define preempt_disable() \
62do { \ 39do { \
63 inc_preempt_count(); \ 40 preempt_count_inc(); \
64 barrier(); \ 41 barrier(); \
65} while (0) 42} while (0)
66 43
67#define sched_preempt_enable_no_resched() \ 44#define sched_preempt_enable_no_resched() \
68do { \ 45do { \
69 barrier(); \ 46 barrier(); \
70 dec_preempt_count(); \ 47 preempt_count_dec(); \
71} while (0) 48} while (0)
72 49
73#define preempt_enable_no_resched() sched_preempt_enable_no_resched() 50#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
74 51
52#ifdef CONFIG_PREEMPT
75#define preempt_enable() \ 53#define preempt_enable() \
76do { \ 54do { \
77 preempt_enable_no_resched(); \
78 barrier(); \ 55 barrier(); \
79 preempt_check_resched(); \ 56 if (unlikely(preempt_count_dec_and_test())) \
57 __preempt_schedule(); \
58} while (0)
59
60#define preempt_check_resched() \
61do { \
62 if (should_resched()) \
63 __preempt_schedule(); \
80} while (0) 64} while (0)
81 65
82/* For debugging and tracer internals only! */ 66#else
83#define add_preempt_count_notrace(val) \ 67#define preempt_enable() preempt_enable_no_resched()
84 do { preempt_count() += (val); } while (0) 68#define preempt_check_resched() do { } while (0)
85#define sub_preempt_count_notrace(val) \ 69#endif
86 do { preempt_count() -= (val); } while (0)
87#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
88#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
89 70
90#define preempt_disable_notrace() \ 71#define preempt_disable_notrace() \
91do { \ 72do { \
92 inc_preempt_count_notrace(); \ 73 __preempt_count_inc(); \
93 barrier(); \ 74 barrier(); \
94} while (0) 75} while (0)
95 76
96#define preempt_enable_no_resched_notrace() \ 77#define preempt_enable_no_resched_notrace() \
97do { \ 78do { \
98 barrier(); \ 79 barrier(); \
99 dec_preempt_count_notrace(); \ 80 __preempt_count_dec(); \
100} while (0) 81} while (0)
101 82
102/* preempt_check_resched is OK to trace */ 83#ifdef CONFIG_PREEMPT
84
85#ifndef CONFIG_CONTEXT_TRACKING
86#define __preempt_schedule_context() __preempt_schedule()
87#endif
88
103#define preempt_enable_notrace() \ 89#define preempt_enable_notrace() \
104do { \ 90do { \
105 preempt_enable_no_resched_notrace(); \
106 barrier(); \ 91 barrier(); \
107 preempt_check_resched_context(); \ 92 if (unlikely(__preempt_count_dec_and_test())) \
93 __preempt_schedule_context(); \
108} while (0) 94} while (0)
95#else
96#define preempt_enable_notrace() preempt_enable_no_resched_notrace()
97#endif
109 98
110#else /* !CONFIG_PREEMPT_COUNT */ 99#else /* !CONFIG_PREEMPT_COUNT */
111 100
@@ -115,10 +104,11 @@ do { \
115 * that can cause faults and scheduling migrate into our preempt-protected 104 * that can cause faults and scheduling migrate into our preempt-protected
116 * region. 105 * region.
117 */ 106 */
118#define preempt_disable() barrier() 107#define preempt_disable() barrier()
119#define sched_preempt_enable_no_resched() barrier() 108#define sched_preempt_enable_no_resched() barrier()
120#define preempt_enable_no_resched() barrier() 109#define preempt_enable_no_resched() barrier()
121#define preempt_enable() barrier() 110#define preempt_enable() barrier()
111#define preempt_check_resched() do { } while (0)
122 112
123#define preempt_disable_notrace() barrier() 113#define preempt_disable_notrace() barrier()
124#define preempt_enable_no_resched_notrace() barrier() 114#define preempt_enable_no_resched_notrace() barrier()
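The net effect of the preempt.h rewrite above is that preempt_enable() becomes a single decrement whose zero result already answers "may we and must we reschedule?", while nesting behaves as before: only the outermost enable can reach zero. A stand-alone toy of that nesting rule; the names deliberately shadow the kernel ones and need_resched is reduced to a plain flag:

#include <stdbool.h>
#include <stdio.h>

static unsigned int preempt_count;	/* 0 == preemptible */
static bool need_resched = true;	/* pretend a wakeup already happened */

static void preempt_disable(void) { preempt_count++; }

static void preempt_enable(void)
{
	if (--preempt_count == 0 && need_resched)
		printf("outermost enable: __preempt_schedule()\n");
	else
		printf("still nested (count=%u): no schedule\n", preempt_count);
}

int main(void)
{
	preempt_disable();	/* outer critical section */
	preempt_disable();	/* nested, e.g. taken inside a helper */
	preempt_enable();	/* 2 -> 1: nothing happens */
	preempt_enable();	/* 1 -> 0: reschedule point */
	return 0;
}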
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6682da36b293..833eed55cf43 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -22,6 +22,7 @@ struct sched_param {
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/nodemask.h> 23#include <linux/nodemask.h>
24#include <linux/mm_types.h> 24#include <linux/mm_types.h>
25#include <linux/preempt.h>
25 26
26#include <asm/page.h> 27#include <asm/page.h>
27#include <asm/ptrace.h> 28#include <asm/ptrace.h>
@@ -427,6 +428,14 @@ struct task_cputime {
427 .sum_exec_runtime = 0, \ 428 .sum_exec_runtime = 0, \
428 } 429 }
429 430
431#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED)
432
433#ifdef CONFIG_PREEMPT_COUNT
434#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
435#else
436#define PREEMPT_DISABLED PREEMPT_ENABLED
437#endif
438
430/* 439/*
431 * Disable preemption until the scheduler is running. 440 * Disable preemption until the scheduler is running.
432 * Reset by start_kernel()->sched_init()->init_idle(). 441 * Reset by start_kernel()->sched_init()->init_idle().
@@ -434,7 +443,7 @@ struct task_cputime {
434 * We include PREEMPT_ACTIVE to avoid cond_resched() from working 443 * We include PREEMPT_ACTIVE to avoid cond_resched() from working
435 * before the scheduler is active -- see should_resched(). 444 * before the scheduler is active -- see should_resched().
436 */ 445 */
437#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) 446#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE)
438 447
439/** 448/**
440 * struct thread_group_cputimer - thread group interval timer counts 449 * struct thread_group_cputimer - thread group interval timer counts
@@ -768,6 +777,7 @@ enum cpu_idle_type {
768#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ 777#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
769#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ 778#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
770#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ 779#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
780#define SD_NUMA 0x4000 /* cross-node balancing */
771 781
772extern int __weak arch_sd_sibiling_asym_packing(void); 782extern int __weak arch_sd_sibiling_asym_packing(void);
773 783
@@ -811,6 +821,10 @@ struct sched_domain {
811 821
812 u64 last_update; 822 u64 last_update;
813 823
824 /* idle_balance() stats */
825 u64 max_newidle_lb_cost;
826 unsigned long next_decay_max_lb_cost;
827
814#ifdef CONFIG_SCHEDSTATS 828#ifdef CONFIG_SCHEDSTATS
815 /* load_balance() stats */ 829 /* load_balance() stats */
816 unsigned int lb_count[CPU_MAX_IDLE_TYPES]; 830 unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -1029,6 +1043,8 @@ struct task_struct {
1029 struct task_struct *last_wakee; 1043 struct task_struct *last_wakee;
1030 unsigned long wakee_flips; 1044 unsigned long wakee_flips;
1031 unsigned long wakee_flip_decay_ts; 1045 unsigned long wakee_flip_decay_ts;
1046
1047 int wake_cpu;
1032#endif 1048#endif
1033 int on_rq; 1049 int on_rq;
1034 1050
@@ -1324,10 +1340,41 @@ struct task_struct {
1324#endif 1340#endif
1325#ifdef CONFIG_NUMA_BALANCING 1341#ifdef CONFIG_NUMA_BALANCING
1326 int numa_scan_seq; 1342 int numa_scan_seq;
1327 int numa_migrate_seq;
1328 unsigned int numa_scan_period; 1343 unsigned int numa_scan_period;
1344 unsigned int numa_scan_period_max;
1345 int numa_preferred_nid;
1346 int numa_migrate_deferred;
1347 unsigned long numa_migrate_retry;
1329 u64 node_stamp; /* migration stamp */ 1348 u64 node_stamp; /* migration stamp */
1330 struct callback_head numa_work; 1349 struct callback_head numa_work;
1350
1351 struct list_head numa_entry;
1352 struct numa_group *numa_group;
1353
1354 /*
1355 * Exponentially decaying average of faults on a per-node basis.
1356 * Scheduling placement decisions are made based on these counts.
1357 * The values remain static for the duration of a PTE scan
1358 */
1359 unsigned long *numa_faults;
1360 unsigned long total_numa_faults;
1361
1362 /*
1363 * numa_faults_buffer records faults per node during the current
1364 * scan window. When the scan completes, the counts in numa_faults
1365 * decay and these values are copied.
1366 */
1367 unsigned long *numa_faults_buffer;
1368
1369 /*
1370 * numa_faults_locality tracks if faults recorded during the last
1371 * scan window were remote/local. The task scan period is adapted
1372 * based on the locality of the faults with different weights
1373 * depending on whether they were shared or private faults
1374 */
1375 unsigned long numa_faults_locality[2];
1376
1377 unsigned long numa_pages_migrated;
1331#endif /* CONFIG_NUMA_BALANCING */ 1378#endif /* CONFIG_NUMA_BALANCING */
1332 1379
1333 struct rcu_head rcu; 1380 struct rcu_head rcu;
@@ -1413,16 +1460,33 @@ struct task_struct {
1413/* Future-safe accessor for struct task_struct's cpus_allowed. */ 1460/* Future-safe accessor for struct task_struct's cpus_allowed. */
1414#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) 1461#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
1415 1462
1463#define TNF_MIGRATED 0x01
1464#define TNF_NO_GROUP 0x02
1465#define TNF_SHARED 0x04
1466#define TNF_FAULT_LOCAL 0x08
1467
1416#ifdef CONFIG_NUMA_BALANCING 1468#ifdef CONFIG_NUMA_BALANCING
1417extern void task_numa_fault(int node, int pages, bool migrated); 1469extern void task_numa_fault(int last_node, int node, int pages, int flags);
1470extern pid_t task_numa_group_id(struct task_struct *p);
1418extern void set_numabalancing_state(bool enabled); 1471extern void set_numabalancing_state(bool enabled);
1472extern void task_numa_free(struct task_struct *p);
1473
1474extern unsigned int sysctl_numa_balancing_migrate_deferred;
1419#else 1475#else
1420static inline void task_numa_fault(int node, int pages, bool migrated) 1476static inline void task_numa_fault(int last_node, int node, int pages,
1477 int flags)
1421{ 1478{
1422} 1479}
1480static inline pid_t task_numa_group_id(struct task_struct *p)
1481{
1482 return 0;
1483}
1423static inline void set_numabalancing_state(bool enabled) 1484static inline void set_numabalancing_state(bool enabled)
1424{ 1485{
1425} 1486}
1487static inline void task_numa_free(struct task_struct *p)
1488{
1489}
1426#endif 1490#endif
1427 1491
1428static inline struct pid *task_pid(struct task_struct *task) 1492static inline struct pid *task_pid(struct task_struct *task)
@@ -1975,7 +2039,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
1975#else 2039#else
1976 static inline void kick_process(struct task_struct *tsk) { } 2040 static inline void kick_process(struct task_struct *tsk) { }
1977#endif 2041#endif
1978extern void sched_fork(struct task_struct *p); 2042extern void sched_fork(unsigned long clone_flags, struct task_struct *p);
1979extern void sched_dead(struct task_struct *p); 2043extern void sched_dead(struct task_struct *p);
1980 2044
1981extern void proc_caches_init(void); 2045extern void proc_caches_init(void);
@@ -2402,11 +2466,6 @@ static inline int signal_pending_state(long state, struct task_struct *p)
2402 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); 2466 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
2403} 2467}
2404 2468
2405static inline int need_resched(void)
2406{
2407 return unlikely(test_thread_flag(TIF_NEED_RESCHED));
2408}
2409
2410/* 2469/*
2411 * cond_resched() and cond_resched_lock(): latency reduction via 2470 * cond_resched() and cond_resched_lock(): latency reduction via
2412 * explicit rescheduling in places that are safe. The return 2471 * explicit rescheduling in places that are safe. The return
@@ -2475,36 +2534,105 @@ static inline int tsk_is_polling(struct task_struct *p)
2475{ 2534{
2476 return task_thread_info(p)->status & TS_POLLING; 2535 return task_thread_info(p)->status & TS_POLLING;
2477} 2536}
2478static inline void current_set_polling(void) 2537static inline void __current_set_polling(void)
2479{ 2538{
2480 current_thread_info()->status |= TS_POLLING; 2539 current_thread_info()->status |= TS_POLLING;
2481} 2540}
2482 2541
2483static inline void current_clr_polling(void) 2542static inline bool __must_check current_set_polling_and_test(void)
2543{
2544 __current_set_polling();
2545
2546 /*
2547 * Polling state must be visible before we test NEED_RESCHED,
2548 * paired by resched_task()
2549 */
2550 smp_mb();
2551
2552 return unlikely(tif_need_resched());
2553}
2554
2555static inline void __current_clr_polling(void)
2484{ 2556{
2485 current_thread_info()->status &= ~TS_POLLING; 2557 current_thread_info()->status &= ~TS_POLLING;
2486 smp_mb__after_clear_bit(); 2558}
2559
2560static inline bool __must_check current_clr_polling_and_test(void)
2561{
2562 __current_clr_polling();
2563
2564 /*
2565 * Polling state must be visible before we test NEED_RESCHED,
2566 * paired by resched_task()
2567 */
2568 smp_mb();
2569
2570 return unlikely(tif_need_resched());
2487} 2571}
2488#elif defined(TIF_POLLING_NRFLAG) 2572#elif defined(TIF_POLLING_NRFLAG)
2489static inline int tsk_is_polling(struct task_struct *p) 2573static inline int tsk_is_polling(struct task_struct *p)
2490{ 2574{
2491 return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); 2575 return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
2492} 2576}
2493static inline void current_set_polling(void) 2577
2578static inline void __current_set_polling(void)
2494{ 2579{
2495 set_thread_flag(TIF_POLLING_NRFLAG); 2580 set_thread_flag(TIF_POLLING_NRFLAG);
2496} 2581}
2497 2582
2498static inline void current_clr_polling(void) 2583static inline bool __must_check current_set_polling_and_test(void)
2584{
2585 __current_set_polling();
2586
2587 /*
2588 * Polling state must be visible before we test NEED_RESCHED,
2589 * paired by resched_task()
2590 *
2591 * XXX: assumes set/clear bit are identical barrier wise.
2592 */
2593 smp_mb__after_clear_bit();
2594
2595 return unlikely(tif_need_resched());
2596}
2597
2598static inline void __current_clr_polling(void)
2499{ 2599{
2500 clear_thread_flag(TIF_POLLING_NRFLAG); 2600 clear_thread_flag(TIF_POLLING_NRFLAG);
2501} 2601}
2602
2603static inline bool __must_check current_clr_polling_and_test(void)
2604{
2605 __current_clr_polling();
2606
2607 /*
2608 * Polling state must be visible before we test NEED_RESCHED,
2609 * paired by resched_task()
2610 */
2611 smp_mb__after_clear_bit();
2612
2613 return unlikely(tif_need_resched());
2614}
2615
2502#else 2616#else
2503static inline int tsk_is_polling(struct task_struct *p) { return 0; } 2617static inline int tsk_is_polling(struct task_struct *p) { return 0; }
2504static inline void current_set_polling(void) { } 2618static inline void __current_set_polling(void) { }
2505static inline void current_clr_polling(void) { } 2619static inline void __current_clr_polling(void) { }
2620
2621static inline bool __must_check current_set_polling_and_test(void)
2622{
2623 return unlikely(tif_need_resched());
2624}
2625static inline bool __must_check current_clr_polling_and_test(void)
2626{
2627 return unlikely(tif_need_resched());
2628}
2506#endif 2629#endif
2507 2630
2631static __always_inline bool need_resched(void)
2632{
2633 return unlikely(tif_need_resched());
2634}
2635
2508/* 2636/*
2509 * Thread group CPU time accounting. 2637 * Thread group CPU time accounting.
2510 */ 2638 */
@@ -2546,6 +2674,11 @@ static inline unsigned int task_cpu(const struct task_struct *p)
2546 return task_thread_info(p)->cpu; 2674 return task_thread_info(p)->cpu;
2547} 2675}
2548 2676
2677static inline int task_node(const struct task_struct *p)
2678{
2679 return cpu_to_node(task_cpu(p));
2680}
2681
2549extern void set_task_cpu(struct task_struct *p, unsigned int cpu); 2682extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
2550 2683
2551#else 2684#else
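The comments on numa_faults/numa_faults_buffer above say the long-term per-node counts decay and the per-window counts are folded in when a PTE scan completes. A stand-alone sketch of that decay-and-fold step; the node count, sample values and the plain halving are illustrative, and the real update also keeps separate shared/private counters per node:

#include <stdio.h>

#define NR_NODES 4

int main(void)
{
	unsigned long numa_faults[NR_NODES]        = { 40, 8, 0, 2 };
	unsigned long numa_faults_buffer[NR_NODES] = { 10, 0, 6, 0 };

	/* End of a scan window: decay the long-term counters, fold in the
	 * faults recorded since the last window, then reset the buffer. */
	for (int nid = 0; nid < NR_NODES; nid++) {
		numa_faults[nid] = numa_faults[nid] / 2 + numa_faults_buffer[nid];
		numa_faults_buffer[nid] = 0;
	}

	for (int nid = 0; nid < NR_NODES; nid++)
		printf("node %d: %lu faults\n", nid, numa_faults[nid]);
	return 0;
}

A placement decision can then simply favour the node with the largest decayed count.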
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index bf8086b2506e..10d16c4fbe89 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -47,7 +47,6 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
47extern unsigned int sysctl_numa_balancing_scan_delay; 47extern unsigned int sysctl_numa_balancing_scan_delay;
48extern unsigned int sysctl_numa_balancing_scan_period_min; 48extern unsigned int sysctl_numa_balancing_scan_period_min;
49extern unsigned int sysctl_numa_balancing_scan_period_max; 49extern unsigned int sysctl_numa_balancing_scan_period_max;
50extern unsigned int sysctl_numa_balancing_scan_period_reset;
51extern unsigned int sysctl_numa_balancing_scan_size; 50extern unsigned int sysctl_numa_balancing_scan_size;
52extern unsigned int sysctl_numa_balancing_settle_count; 51extern unsigned int sysctl_numa_balancing_settle_count;
53 52
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 3b5e910d14ca..d2abbdb8c6aa 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -28,6 +28,7 @@ struct cpu_stop_work {
28}; 28};
29 29
30int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); 30int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
31int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg);
31void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, 32void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
32 struct cpu_stop_work *work_buf); 33 struct cpu_stop_work *work_buf);
33int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); 34int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index e7e04736802f..fddbe2023a5d 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -104,8 +104,21 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
104#define test_thread_flag(flag) \ 104#define test_thread_flag(flag) \
105 test_ti_thread_flag(current_thread_info(), flag) 105 test_ti_thread_flag(current_thread_info(), flag)
106 106
107#define set_need_resched() set_thread_flag(TIF_NEED_RESCHED) 107static inline __deprecated void set_need_resched(void)
108#define clear_need_resched() clear_thread_flag(TIF_NEED_RESCHED) 108{
109 /*
110 * Use of this function is deprecated.
111 *
112 * As of this writing there are only a few users in the DRM tree left
113 * all of which are wrong and can be removed without causing too much
114 * grief.
115 *
116 * The DRM people are aware and are working on removing the last few
117 * instances.
118 */
119}
120
121#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
109 122
110#if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK 123#if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
111/* 124/*
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d3cf0d6e7712..12ae6ce997d6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -106,6 +106,8 @@ int arch_update_cpu_topology(void);
106 .last_balance = jiffies, \ 106 .last_balance = jiffies, \
107 .balance_interval = 1, \ 107 .balance_interval = 1, \
108 .smt_gain = 1178, /* 15% */ \ 108 .smt_gain = 1178, /* 15% */ \
109 .max_newidle_lb_cost = 0, \
110 .next_decay_max_lb_cost = jiffies, \
109} 111}
110#endif 112#endif
111#endif /* CONFIG_SCHED_SMT */ 113#endif /* CONFIG_SCHED_SMT */
@@ -135,6 +137,8 @@ int arch_update_cpu_topology(void);
135 , \ 137 , \
136 .last_balance = jiffies, \ 138 .last_balance = jiffies, \
137 .balance_interval = 1, \ 139 .balance_interval = 1, \
140 .max_newidle_lb_cost = 0, \
141 .next_decay_max_lb_cost = jiffies, \
138} 142}
139#endif 143#endif
140#endif /* CONFIG_SCHED_MC */ 144#endif /* CONFIG_SCHED_MC */
@@ -166,6 +170,8 @@ int arch_update_cpu_topology(void);
166 , \ 170 , \
167 .last_balance = jiffies, \ 171 .last_balance = jiffies, \
168 .balance_interval = 1, \ 172 .balance_interval = 1, \
173 .max_newidle_lb_cost = 0, \
174 .next_decay_max_lb_cost = jiffies, \
169} 175}
170#endif 176#endif
171 177
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 64f864651d86..633cac77f9f9 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -672,31 +672,17 @@ static inline void tty_wait_until_sent_from_close(struct tty_struct *tty,
672#define wait_event_interruptible_tty(tty, wq, condition) \ 672#define wait_event_interruptible_tty(tty, wq, condition) \
673({ \ 673({ \
674 int __ret = 0; \ 674 int __ret = 0; \
675 if (!(condition)) { \ 675 if (!(condition)) \
676 __wait_event_interruptible_tty(tty, wq, condition, __ret); \ 676 __ret = __wait_event_interruptible_tty(tty, wq, \
677 } \ 677 condition); \
678 __ret; \ 678 __ret; \
679}) 679})
680 680
681#define __wait_event_interruptible_tty(tty, wq, condition, ret) \ 681#define __wait_event_interruptible_tty(tty, wq, condition) \
682do { \ 682 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
683 DEFINE_WAIT(__wait); \ 683 tty_unlock(tty); \
684 \
685 for (;;) { \
686 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
687 if (condition) \
688 break; \
689 if (!signal_pending(current)) { \
690 tty_unlock(tty); \
691 schedule(); \ 684 schedule(); \
692 tty_lock(tty); \ 685 tty_lock(tty))
693 continue; \
694 } \
695 ret = -ERESTARTSYS; \
696 break; \
697 } \
698 finish_wait(&wq, &__wait); \
699} while (0)
700 686
701#ifdef CONFIG_PROC_FS 687#ifdef CONFIG_PROC_FS
702extern void proc_tty_register_driver(struct tty_driver *); 688extern void proc_tty_register_driver(struct tty_driver *);
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 5ca0951e1855..9d8cf056e661 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -15,7 +15,7 @@
15 */ 15 */
16static inline void pagefault_disable(void) 16static inline void pagefault_disable(void)
17{ 17{
18 inc_preempt_count(); 18 preempt_count_inc();
19 /* 19 /*
20 * make sure to have issued the store before a pagefault 20 * make sure to have issued the store before a pagefault
21 * can hit. 21 * can hit.
@@ -30,11 +30,7 @@ static inline void pagefault_enable(void)
30 * the pagefault handler again. 30 * the pagefault handler again.
31 */ 31 */
32 barrier(); 32 barrier();
33 dec_preempt_count(); 33 preempt_count_dec();
34 /*
35 * make sure we do..
36 */
37 barrier();
38 preempt_check_resched(); 34 preempt_check_resched();
39} 35}
40 36
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a67fc1635592..a2726c7dd244 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -1,7 +1,8 @@
1#ifndef _LINUX_WAIT_H 1#ifndef _LINUX_WAIT_H
2#define _LINUX_WAIT_H 2#define _LINUX_WAIT_H
3 3/*
4 4 * Linux wait queue related types and methods
5 */
5#include <linux/list.h> 6#include <linux/list.h>
6#include <linux/stddef.h> 7#include <linux/stddef.h>
7#include <linux/spinlock.h> 8#include <linux/spinlock.h>
@@ -13,27 +14,27 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v
13int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); 14int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
14 15
15struct __wait_queue { 16struct __wait_queue {
16 unsigned int flags; 17 unsigned int flags;
17#define WQ_FLAG_EXCLUSIVE 0x01 18#define WQ_FLAG_EXCLUSIVE 0x01
18 void *private; 19 void *private;
19 wait_queue_func_t func; 20 wait_queue_func_t func;
20 struct list_head task_list; 21 struct list_head task_list;
21}; 22};
22 23
23struct wait_bit_key { 24struct wait_bit_key {
24 void *flags; 25 void *flags;
25 int bit_nr; 26 int bit_nr;
26#define WAIT_ATOMIC_T_BIT_NR -1 27#define WAIT_ATOMIC_T_BIT_NR -1
27}; 28};
28 29
29struct wait_bit_queue { 30struct wait_bit_queue {
30 struct wait_bit_key key; 31 struct wait_bit_key key;
31 wait_queue_t wait; 32 wait_queue_t wait;
32}; 33};
33 34
34struct __wait_queue_head { 35struct __wait_queue_head {
35 spinlock_t lock; 36 spinlock_t lock;
36 struct list_head task_list; 37 struct list_head task_list;
37}; 38};
38typedef struct __wait_queue_head wait_queue_head_t; 39typedef struct __wait_queue_head wait_queue_head_t;
39 40
@@ -84,17 +85,17 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct
84 85
85static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) 86static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
86{ 87{
87 q->flags = 0; 88 q->flags = 0;
88 q->private = p; 89 q->private = p;
89 q->func = default_wake_function; 90 q->func = default_wake_function;
90} 91}
91 92
92static inline void init_waitqueue_func_entry(wait_queue_t *q, 93static inline void
93 wait_queue_func_t func) 94init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
94{ 95{
95 q->flags = 0; 96 q->flags = 0;
96 q->private = NULL; 97 q->private = NULL;
97 q->func = func; 98 q->func = func;
98} 99}
99 100
100static inline int waitqueue_active(wait_queue_head_t *q) 101static inline int waitqueue_active(wait_queue_head_t *q)
@@ -114,8 +115,8 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
114/* 115/*
115 * Used for wake-one threads: 116 * Used for wake-one threads:
116 */ 117 */
117static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, 118static inline void
118 wait_queue_t *wait) 119__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
119{ 120{
120 wait->flags |= WQ_FLAG_EXCLUSIVE; 121 wait->flags |= WQ_FLAG_EXCLUSIVE;
121 __add_wait_queue(q, wait); 122 __add_wait_queue(q, wait);
@@ -127,23 +128,22 @@ static inline void __add_wait_queue_tail(wait_queue_head_t *head,
127 list_add_tail(&new->task_list, &head->task_list); 128 list_add_tail(&new->task_list, &head->task_list);
128} 129}
129 130
130static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q, 131static inline void
131 wait_queue_t *wait) 132__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
132{ 133{
133 wait->flags |= WQ_FLAG_EXCLUSIVE; 134 wait->flags |= WQ_FLAG_EXCLUSIVE;
134 __add_wait_queue_tail(q, wait); 135 __add_wait_queue_tail(q, wait);
135} 136}
136 137
137static inline void __remove_wait_queue(wait_queue_head_t *head, 138static inline void
138 wait_queue_t *old) 139__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
139{ 140{
140 list_del(&old->task_list); 141 list_del(&old->task_list);
141} 142}
142 143
143void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); 144void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
144void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); 145void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
145void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, 146void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
146 void *key);
147void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); 147void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
148void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); 148void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
149void __wake_up_bit(wait_queue_head_t *, void *, int); 149void __wake_up_bit(wait_queue_head_t *, void *, int);
@@ -170,27 +170,61 @@ wait_queue_head_t *bit_waitqueue(void *, int);
170/* 170/*
171 * Wakeup macros to be used to report events to the targets. 171 * Wakeup macros to be used to report events to the targets.
172 */ 172 */
173#define wake_up_poll(x, m) \ 173#define wake_up_poll(x, m) \
174 __wake_up(x, TASK_NORMAL, 1, (void *) (m)) 174 __wake_up(x, TASK_NORMAL, 1, (void *) (m))
175#define wake_up_locked_poll(x, m) \ 175#define wake_up_locked_poll(x, m) \
176 __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) 176 __wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
177#define wake_up_interruptible_poll(x, m) \ 177#define wake_up_interruptible_poll(x, m) \
178 __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) 178 __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
179#define wake_up_interruptible_sync_poll(x, m) \ 179#define wake_up_interruptible_sync_poll(x, m) \
180 __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) 180 __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
181 181
182#define __wait_event(wq, condition) \ 182#define ___wait_cond_timeout(condition) \
183do { \ 183({ \
184 bool __cond = (condition); \
185 if (__cond && !__ret) \
186 __ret = 1; \
187 __cond || !__ret; \
188})
189
190#define ___wait_signal_pending(state) \
191 ((state == TASK_INTERRUPTIBLE && signal_pending(current)) || \
192 (state == TASK_KILLABLE && fatal_signal_pending(current)))
193
194#define ___wait_event(wq, condition, state, exclusive, ret, cmd) \
195({ \
196 __label__ __out; \
184 DEFINE_WAIT(__wait); \ 197 DEFINE_WAIT(__wait); \
198 long __ret = ret; \
185 \ 199 \
186 for (;;) { \ 200 for (;;) { \
187 prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ 201 if (exclusive) \
202 prepare_to_wait_exclusive(&wq, &__wait, state); \
203 else \
204 prepare_to_wait(&wq, &__wait, state); \
205 \
188 if (condition) \ 206 if (condition) \
189 break; \ 207 break; \
190 schedule(); \ 208 \
209 if (___wait_signal_pending(state)) { \
210 __ret = -ERESTARTSYS; \
211 if (exclusive) { \
212 abort_exclusive_wait(&wq, &__wait, \
213 state, NULL); \
214 goto __out; \
215 } \
216 break; \
217 } \
218 \
219 cmd; \
191 } \ 220 } \
192 finish_wait(&wq, &__wait); \ 221 finish_wait(&wq, &__wait); \
193} while (0) 222__out: __ret; \
223})
224
225#define __wait_event(wq, condition) \
226 (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
227 schedule())
194 228
195/** 229/**
196 * wait_event - sleep until a condition gets true 230 * wait_event - sleep until a condition gets true
@@ -204,29 +238,17 @@ do { \
204 * wake_up() has to be called after changing any variable that could 238 * wake_up() has to be called after changing any variable that could
205 * change the result of the wait condition. 239 * change the result of the wait condition.
206 */ 240 */
207#define wait_event(wq, condition) \ 241#define wait_event(wq, condition) \
208do { \ 242do { \
209 if (condition) \ 243 if (condition) \
210 break; \ 244 break; \
211 __wait_event(wq, condition); \ 245 __wait_event(wq, condition); \
212} while (0) 246} while (0)
213 247
214#define __wait_event_timeout(wq, condition, ret) \ 248#define __wait_event_timeout(wq, condition, timeout) \
215do { \ 249 ___wait_event(wq, ___wait_cond_timeout(condition), \
216 DEFINE_WAIT(__wait); \ 250 TASK_UNINTERRUPTIBLE, 0, timeout, \
217 \ 251 __ret = schedule_timeout(__ret))
218 for (;;) { \
219 prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
220 if (condition) \
221 break; \
222 ret = schedule_timeout(ret); \
223 if (!ret) \
224 break; \
225 } \
226 if (!ret && (condition)) \
227 ret = 1; \
228 finish_wait(&wq, &__wait); \
229} while (0)
230 252
231/** 253/**
232 * wait_event_timeout - sleep until a condition gets true or a timeout elapses 254 * wait_event_timeout - sleep until a condition gets true or a timeout elapses
@@ -248,28 +270,14 @@ do { \
248#define wait_event_timeout(wq, condition, timeout) \ 270#define wait_event_timeout(wq, condition, timeout) \
249({ \ 271({ \
250 long __ret = timeout; \ 272 long __ret = timeout; \
251 if (!(condition)) \ 273 if (!(condition)) \
252 __wait_event_timeout(wq, condition, __ret); \ 274 __ret = __wait_event_timeout(wq, condition, timeout); \
253 __ret; \ 275 __ret; \
254}) 276})
255 277
256#define __wait_event_interruptible(wq, condition, ret) \ 278#define __wait_event_interruptible(wq, condition) \
257do { \ 279 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
258 DEFINE_WAIT(__wait); \ 280 schedule())
259 \
260 for (;;) { \
261 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
262 if (condition) \
263 break; \
264 if (!signal_pending(current)) { \
265 schedule(); \
266 continue; \
267 } \
268 ret = -ERESTARTSYS; \
269 break; \
270 } \
271 finish_wait(&wq, &__wait); \
272} while (0)
273 281
274/** 282/**
275 * wait_event_interruptible - sleep until a condition gets true 283 * wait_event_interruptible - sleep until a condition gets true
@@ -290,31 +298,14 @@ do { \
290({ \ 298({ \
291 int __ret = 0; \ 299 int __ret = 0; \
292 if (!(condition)) \ 300 if (!(condition)) \
293 __wait_event_interruptible(wq, condition, __ret); \ 301 __ret = __wait_event_interruptible(wq, condition); \
294 __ret; \ 302 __ret; \
295}) 303})
296 304
297#define __wait_event_interruptible_timeout(wq, condition, ret) \ 305#define __wait_event_interruptible_timeout(wq, condition, timeout) \
298do { \ 306 ___wait_event(wq, ___wait_cond_timeout(condition), \
299 DEFINE_WAIT(__wait); \ 307 TASK_INTERRUPTIBLE, 0, timeout, \
300 \ 308 __ret = schedule_timeout(__ret))
301 for (;;) { \
302 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
303 if (condition) \
304 break; \
305 if (!signal_pending(current)) { \
306 ret = schedule_timeout(ret); \
307 if (!ret) \
308 break; \
309 continue; \
310 } \
311 ret = -ERESTARTSYS; \
312 break; \
313 } \
314 if (!ret && (condition)) \
315 ret = 1; \
316 finish_wait(&wq, &__wait); \
317} while (0)
318 309
319/** 310/**
320 * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses 311 * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
@@ -338,14 +329,14 @@ do { \
338({ \ 329({ \
339 long __ret = timeout; \ 330 long __ret = timeout; \
340 if (!(condition)) \ 331 if (!(condition)) \
341 __wait_event_interruptible_timeout(wq, condition, __ret); \ 332 __ret = __wait_event_interruptible_timeout(wq, \
333 condition, timeout); \
342 __ret; \ 334 __ret; \
343}) 335})
344 336
345#define __wait_event_hrtimeout(wq, condition, timeout, state) \ 337#define __wait_event_hrtimeout(wq, condition, timeout, state) \
346({ \ 338({ \
347 int __ret = 0; \ 339 int __ret = 0; \
348 DEFINE_WAIT(__wait); \
349 struct hrtimer_sleeper __t; \ 340 struct hrtimer_sleeper __t; \
350 \ 341 \
351 hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ 342 hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \
@@ -356,25 +347,15 @@ do { \
356 current->timer_slack_ns, \ 347 current->timer_slack_ns, \
357 HRTIMER_MODE_REL); \ 348 HRTIMER_MODE_REL); \
358 \ 349 \
359 for (;;) { \ 350 __ret = ___wait_event(wq, condition, state, 0, 0, \
360 prepare_to_wait(&wq, &__wait, state); \
361 if (condition) \
362 break; \
363 if (state == TASK_INTERRUPTIBLE && \
364 signal_pending(current)) { \
365 __ret = -ERESTARTSYS; \
366 break; \
367 } \
368 if (!__t.task) { \ 351 if (!__t.task) { \
369 __ret = -ETIME; \ 352 __ret = -ETIME; \
370 break; \ 353 break; \
371 } \ 354 } \
372 schedule(); \ 355 schedule()); \
373 } \
374 \ 356 \
375 hrtimer_cancel(&__t.timer); \ 357 hrtimer_cancel(&__t.timer); \
376 destroy_hrtimer_on_stack(&__t.timer); \ 358 destroy_hrtimer_on_stack(&__t.timer); \
377 finish_wait(&wq, &__wait); \
378 __ret; \ 359 __ret; \
379}) 360})
380 361
@@ -428,33 +409,15 @@ do { \
428 __ret; \ 409 __ret; \
429}) 410})
430 411
431#define __wait_event_interruptible_exclusive(wq, condition, ret) \ 412#define __wait_event_interruptible_exclusive(wq, condition) \
432do { \ 413 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
433 DEFINE_WAIT(__wait); \ 414 schedule())
434 \
435 for (;;) { \
436 prepare_to_wait_exclusive(&wq, &__wait, \
437 TASK_INTERRUPTIBLE); \
438 if (condition) { \
439 finish_wait(&wq, &__wait); \
440 break; \
441 } \
442 if (!signal_pending(current)) { \
443 schedule(); \
444 continue; \
445 } \
446 ret = -ERESTARTSYS; \
447 abort_exclusive_wait(&wq, &__wait, \
448 TASK_INTERRUPTIBLE, NULL); \
449 break; \
450 } \
451} while (0)
452 415
453#define wait_event_interruptible_exclusive(wq, condition) \ 416#define wait_event_interruptible_exclusive(wq, condition) \
454({ \ 417({ \
455 int __ret = 0; \ 418 int __ret = 0; \
456 if (!(condition)) \ 419 if (!(condition)) \
457 __wait_event_interruptible_exclusive(wq, condition, __ret);\ 420 __ret = __wait_event_interruptible_exclusive(wq, condition);\
458 __ret; \ 421 __ret; \
459}) 422})
460 423
@@ -606,24 +569,8 @@ do { \
606 ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1)) 569 ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1))
607 570
608 571
609 572#define __wait_event_killable(wq, condition) \
610#define __wait_event_killable(wq, condition, ret) \ 573 ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())
611do { \
612 DEFINE_WAIT(__wait); \
613 \
614 for (;;) { \
615 prepare_to_wait(&wq, &__wait, TASK_KILLABLE); \
616 if (condition) \
617 break; \
618 if (!fatal_signal_pending(current)) { \
619 schedule(); \
620 continue; \
621 } \
622 ret = -ERESTARTSYS; \
623 break; \
624 } \
625 finish_wait(&wq, &__wait); \
626} while (0)
627 574
628/** 575/**
629 * wait_event_killable - sleep until a condition gets true 576 * wait_event_killable - sleep until a condition gets true
@@ -644,26 +591,17 @@ do { \
644({ \ 591({ \
645 int __ret = 0; \ 592 int __ret = 0; \
646 if (!(condition)) \ 593 if (!(condition)) \
647 __wait_event_killable(wq, condition, __ret); \ 594 __ret = __wait_event_killable(wq, condition); \
648 __ret; \ 595 __ret; \
649}) 596})
650 597
651 598
652#define __wait_event_lock_irq(wq, condition, lock, cmd) \ 599#define __wait_event_lock_irq(wq, condition, lock, cmd) \
653do { \ 600 (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
654 DEFINE_WAIT(__wait); \ 601 spin_unlock_irq(&lock); \
655 \ 602 cmd; \
656 for (;;) { \ 603 schedule(); \
657 prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ 604 spin_lock_irq(&lock))
658 if (condition) \
659 break; \
660 spin_unlock_irq(&lock); \
661 cmd; \
662 schedule(); \
663 spin_lock_irq(&lock); \
664 } \
665 finish_wait(&wq, &__wait); \
666} while (0)
667 605
668/** 606/**
669 * wait_event_lock_irq_cmd - sleep until a condition gets true. The 607 * wait_event_lock_irq_cmd - sleep until a condition gets true. The
@@ -723,26 +661,12 @@ do { \
723} while (0) 661} while (0)
724 662
725 663
726#define __wait_event_interruptible_lock_irq(wq, condition, \ 664#define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd) \
727 lock, ret, cmd) \ 665 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
728do { \ 666 spin_unlock_irq(&lock); \
729 DEFINE_WAIT(__wait); \ 667 cmd; \
730 \ 668 schedule(); \
731 for (;;) { \ 669 spin_lock_irq(&lock))
732 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
733 if (condition) \
734 break; \
735 if (signal_pending(current)) { \
736 ret = -ERESTARTSYS; \
737 break; \
738 } \
739 spin_unlock_irq(&lock); \
740 cmd; \
741 schedule(); \
742 spin_lock_irq(&lock); \
743 } \
744 finish_wait(&wq, &__wait); \
745} while (0)
746 670
747/** 671/**
748 * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. 672 * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
@@ -772,10 +696,9 @@ do { \
772#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ 696#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \
773({ \ 697({ \
774 int __ret = 0; \ 698 int __ret = 0; \
775 \
776 if (!(condition)) \ 699 if (!(condition)) \
777 __wait_event_interruptible_lock_irq(wq, condition, \ 700 __ret = __wait_event_interruptible_lock_irq(wq, \
778 lock, __ret, cmd); \ 701 condition, lock, cmd); \
779 __ret; \ 702 __ret; \
780}) 703})
781 704
@@ -804,39 +727,24 @@ do { \
804#define wait_event_interruptible_lock_irq(wq, condition, lock) \ 727#define wait_event_interruptible_lock_irq(wq, condition, lock) \
805({ \ 728({ \
806 int __ret = 0; \ 729 int __ret = 0; \
807 \
808 if (!(condition)) \ 730 if (!(condition)) \
809 __wait_event_interruptible_lock_irq(wq, condition, \ 731 __ret = __wait_event_interruptible_lock_irq(wq, \
810 lock, __ret, ); \ 732 condition, lock,); \
811 __ret; \ 733 __ret; \
812}) 734})
813 735
814#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ 736#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \
815 lock, ret) \ 737 lock, timeout) \
816do { \ 738 ___wait_event(wq, ___wait_cond_timeout(condition), \
817 DEFINE_WAIT(__wait); \ 739 TASK_INTERRUPTIBLE, 0, timeout, \
818 \ 740 spin_unlock_irq(&lock); \
819 for (;;) { \ 741 __ret = schedule_timeout(__ret); \
820 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ 742 spin_lock_irq(&lock));
821 if (condition) \
822 break; \
823 if (signal_pending(current)) { \
824 ret = -ERESTARTSYS; \
825 break; \
826 } \
827 spin_unlock_irq(&lock); \
828 ret = schedule_timeout(ret); \
829 spin_lock_irq(&lock); \
830 if (!ret) \
831 break; \
832 } \
833 finish_wait(&wq, &__wait); \
834} while (0)
835 743
836/** 744/**
837 * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets true or a timeout elapses. 745 * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
838 * The condition is checked under the lock. This is expected 746 * true or a timeout elapses. The condition is checked under
839 * to be called with the lock taken. 747 * the lock. This is expected to be called with the lock taken.
840 * @wq: the waitqueue to wait on 748 * @wq: the waitqueue to wait on
841 * @condition: a C expression for the event to wait for 749 * @condition: a C expression for the event to wait for
842 * @lock: a locked spinlock_t, which will be released before schedule() 750 * @lock: a locked spinlock_t, which will be released before schedule()
@@ -860,11 +768,10 @@ do { \
860#define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \ 768#define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \
861 timeout) \ 769 timeout) \
862({ \ 770({ \
863 int __ret = timeout; \ 771 long __ret = timeout; \
864 \
865 if (!(condition)) \ 772 if (!(condition)) \
866 __wait_event_interruptible_lock_irq_timeout( \ 773 __ret = __wait_event_interruptible_lock_irq_timeout( \
867 wq, condition, lock, __ret); \ 774 wq, condition, lock, timeout); \
868 __ret; \ 775 __ret; \
869}) 776})
870 777
@@ -875,11 +782,9 @@ do { \
875 * We plan to remove these interfaces. 782 * We plan to remove these interfaces.
876 */ 783 */
877extern void sleep_on(wait_queue_head_t *q); 784extern void sleep_on(wait_queue_head_t *q);
878extern long sleep_on_timeout(wait_queue_head_t *q, 785extern long sleep_on_timeout(wait_queue_head_t *q, signed long timeout);
879 signed long timeout);
880extern void interruptible_sleep_on(wait_queue_head_t *q); 786extern void interruptible_sleep_on(wait_queue_head_t *q);
881extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, 787extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout);
882 signed long timeout);
883 788
884/* 789/*
885 * Waitqueues which are removed from the waitqueue_head at wakeup time 790 * Waitqueues which are removed from the waitqueue_head at wakeup time
@@ -887,8 +792,7 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q,
887void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); 792void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
888void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); 793void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
889void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); 794void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
890void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, 795void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
891 unsigned int mode, void *key);
892int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 796int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
893int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 797int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
894 798
@@ -934,8 +838,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
934 * One uses wait_on_bit() where one is waiting for the bit to clear, 838 * One uses wait_on_bit() where one is waiting for the bit to clear,
935 * but has no intention of setting it. 839 * but has no intention of setting it.
936 */ 840 */
937static inline int wait_on_bit(void *word, int bit, 841static inline int
938 int (*action)(void *), unsigned mode) 842wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode)
939{ 843{
940 if (!test_bit(bit, word)) 844 if (!test_bit(bit, word))
941 return 0; 845 return 0;
@@ -958,8 +862,8 @@ static inline int wait_on_bit(void *word, int bit,
958 * One uses wait_on_bit_lock() where one is waiting for the bit to 862 * One uses wait_on_bit_lock() where one is waiting for the bit to
959 * clear with the intention of setting it, and when done, clearing it. 863 * clear with the intention of setting it, and when done, clearing it.
960 */ 864 */
961static inline int wait_on_bit_lock(void *word, int bit, 865static inline int
962 int (*action)(void *), unsigned mode) 866wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode)
963{ 867{
964 if (!test_and_set_bit(bit, word)) 868 if (!test_and_set_bit(bit, word))
965 return 0; 869 return 0;
@@ -983,5 +887,5 @@ int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
983 return 0; 887 return 0;
984 return out_of_line_wait_on_atomic_t(val, action, mode); 888 return out_of_line_wait_on_atomic_t(val, action, mode);
985} 889}
986 890
987#endif 891#endif /* _LINUX_WAIT_H */
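The wait.h hunks above collapse a family of hand-rolled wait loops into thin wrappers around a single ___wait_event() template. As a rough illustration of that shape only, here is a minimal standalone sketch in userspace C; the ___toy_wait_event() macro, the fake_schedule() helper and the condition are inventions of this example, not part of the kernel API.

    /* toy_wait.c - a standalone userspace model of the consolidated wait-loop
     * template: pre-load __ret, loop until the condition holds, run "cmd" on
     * every pass.  No waitqueue, task state or signal handling is modelled.
     */
    #include <stdio.h>

    #define ___toy_wait_event(condition, ret, cmd)                      \
    ({                                                                  \
            long __ret = ret;   /* timeout wrappers pre-load this */    \
            while (!(condition)) {                                      \
                    cmd;        /* stands in for schedule() variants */ \
            }                                                           \
            __ret;                                                      \
    })

    static int events;
    static void fake_schedule(void) { events++; }  /* pretend we slept once */

    int main(void)
    {
            long ret = ___toy_wait_event(events >= 3, 0, fake_schedule());

            printf("condition met after %d fake sleeps, __ret=%ld\n", events, ret);
            return 0;
    }

Built with a GNU compiler (the statement expression is a GCC extension, as in the kernel), this reports the condition being met after three fake sleeps. The per-flavour wrappers in the patch add only a pre-loaded return value and a flavour-specific "cmd" on top of this shared loop.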
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 2e7d9947a10d..613381bcde40 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
100 /* 100 /*
101 * For all intents and purposes a preempted task is a running task. 101 * For all intents and purposes a preempted task is a running task.
102 */ 102 */
103 if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) 103 if (task_preempt_count(p) & PREEMPT_ACTIVE)
104 state = TASK_RUNNING | TASK_STATE_MAX; 104 state = TASK_RUNNING | TASK_STATE_MAX;
105#endif 105#endif
106 106
diff --git a/init/main.c b/init/main.c
index 63d3e8f2970c..379090fadac9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -693,7 +693,7 @@ int __init_or_module do_one_initcall(initcall_t fn)
693 693
694 if (preempt_count() != count) { 694 if (preempt_count() != count) {
695 sprintf(msgbuf, "preemption imbalance "); 695 sprintf(msgbuf, "preemption imbalance ");
696 preempt_count() = count; 696 preempt_count_set(count);
697 } 697 }
698 if (irqs_disabled()) { 698 if (irqs_disabled()) {
699 strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); 699 strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..e8ca97b5c386 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,7 @@
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h> 12#include <linux/page_cgroup.h>
13#include <linux/log2.h>
13 14
14void foo(void) 15void foo(void)
15{ 16{
@@ -17,5 +18,8 @@ void foo(void)
17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 18 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 19 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); 20 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
21#ifdef CONFIG_SMP
22 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
23#endif
20 /* End of constants */ 24 /* End of constants */
21} 25}
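NR_CPUS_BITS above is derived with ilog2(CONFIG_NR_CPUS). The short standalone program below mirrors that floor-of-log2 calculation for a few illustrative CPU counts; the sample values are invented and the local ilog2() is a plain C reimplementation, not the kernel helper.

    /* nr_cpus_bits.c - model of the ilog2() derivation used for NR_CPUS_BITS */
    #include <stdio.h>

    static unsigned int ilog2(unsigned long n)
    {
            unsigned int bits = 0;

            while (n >>= 1)
                    bits++;
            return bits;            /* floor(log2(n)) for n >= 1 */
    }

    int main(void)
    {
            unsigned long samples[] = { 4, 64, 256, 4096 };

            for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                    printf("CONFIG_NR_CPUS=%-5lu -> NR_CPUS_BITS=%u\n",
                           samples[i], ilog2(samples[i]));
            return 0;
    }

With the usual power-of-two configurations this gives 2, 6, 8 and 12 bits respectively.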
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 859c8dfd78a1..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
120 * instead of preempt_schedule() to exit user context if needed before 120 * instead of preempt_schedule() to exit user context if needed before
121 * calling the scheduler. 121 * calling the scheduler.
122 */ 122 */
123void __sched notrace preempt_schedule_context(void) 123asmlinkage void __sched notrace preempt_schedule_context(void)
124{ 124{
125 enum ctx_state prev_ctx; 125 enum ctx_state prev_ctx;
126 126
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
44 rcu_idle_enter(); 44 rcu_idle_enter();
45 trace_cpu_idle_rcuidle(0, smp_processor_id()); 45 trace_cpu_idle_rcuidle(0, smp_processor_id());
46 local_irq_enable(); 46 local_irq_enable();
47 while (!need_resched()) 47 while (!tif_need_resched())
48 cpu_relax(); 48 cpu_relax();
49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
50 rcu_idle_exit(); 50 rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) { 92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
93 cpu_idle_poll(); 93 cpu_idle_poll();
94 } else { 94 } else {
95 current_clr_polling(); 95 if (!current_clr_polling_and_test()) {
96 if (!need_resched()) {
97 stop_critical_timings(); 96 stop_critical_timings();
98 rcu_idle_enter(); 97 rcu_idle_enter();
99 arch_cpu_idle(); 98 arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
103 } else { 102 } else {
104 local_irq_enable(); 103 local_irq_enable();
105 } 104 }
106 current_set_polling(); 105 __current_set_polling();
107 } 106 }
108 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
110 * bit here because we might not have sent the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
109 } 115 }
110 tick_nohz_idle_exit(); 116 tick_nohz_idle_exit();
111 schedule_preempt_disabled(); 117 schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
129 */ 135 */
130 boot_init_stack_canary(); 136 boot_init_stack_canary();
131#endif 137#endif
132 current_set_polling(); 138 __current_set_polling();
133 arch_cpu_idle_prepare(); 139 arch_cpu_idle_prepare();
134 cpu_idle_loop(); 140 cpu_idle_loop();
135} 141}
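The idle-loop change above replaces the separate current_clr_polling(); if (!need_resched()) pair with current_clr_polling_and_test(), so that a reschedule request posted while the CPU still advertised itself as polling cannot be missed. A simplified userspace model of that ordering concern, using C11 atomics, is sketched below; the flag names echo the kernel ones, but the types, memory ordering and the "waker" are inventions of the example.

    /* idle_poll_model.c - why "clear the polling flag, then test need_resched"
     * must be one ordered step before going idle.
     */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool polling = true;        /* "wakers may skip the IPI" */
    static atomic_bool need_resched = false;  /* set by a remote waker */

    static bool clr_polling_and_test(void)
    {
            /* Publish that we are no longer polling, then check whether a
             * wakeup was posted while we still advertised polling. */
            atomic_store(&polling, false);
            return atomic_load(&need_resched);
    }

    int main(void)
    {
            /* A waker that ran while we were still polling: it sets the flag
             * and, seeing polling == true, skips the (modelled) IPI. */
            atomic_store(&need_resched, true);

            if (clr_polling_and_test())
                    printf("wakeup observed before idling, do not sleep\n");
            else
                    printf("would enter arch_cpu_idle() and miss the wakeup\n");
            return 0;
    }

The combined kernel helper manipulates the real thread-info word with the required barriers; the point of the sketch is only that the need_resched test has to come after the polling bit is cleared.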
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..c93be06dee87 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
817#ifdef CONFIG_TRANSPARENT_HUGEPAGE 817#ifdef CONFIG_TRANSPARENT_HUGEPAGE
818 mm->pmd_huge_pte = NULL; 818 mm->pmd_huge_pte = NULL;
819#endif 819#endif
820#ifdef CONFIG_NUMA_BALANCING
821 mm->first_nid = NUMA_PTE_SCAN_INIT;
822#endif
823 if (!mm_init(mm, tsk)) 820 if (!mm_init(mm, tsk))
824 goto fail_nomem; 821 goto fail_nomem;
825 822
@@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1313#endif 1310#endif
1314 1311
1315 /* Perform scheduler related setup. Assign this task to a CPU. */ 1312 /* Perform scheduler related setup. Assign this task to a CPU. */
1316 sched_fork(p); 1313 sched_fork(clone_flags, p);
1317 1314
1318 retval = perf_event_init_task(p); 1315 retval = perf_event_init_task(p);
1319 if (retval) 1316 if (retval)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 32618b3fe4e6..1dc9f3604ad8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -898,6 +898,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
898 force_quiescent_state(rsp); /* Kick them all. */ 898 force_quiescent_state(rsp); /* Kick them all. */
899} 899}
900 900
901/*
902 * This function really isn't for public consumption, but RCU is special in
903 * that context switches can allow the state machine to make progress.
904 */
905extern void resched_cpu(int cpu);
906
901static void print_cpu_stall(struct rcu_state *rsp) 907static void print_cpu_stall(struct rcu_state *rsp)
902{ 908{
903 int cpu; 909 int cpu;
@@ -927,7 +933,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
927 3 * rcu_jiffies_till_stall_check() + 3; 933 3 * rcu_jiffies_till_stall_check() + 3;
928 raw_spin_unlock_irqrestore(&rnp->lock, flags); 934 raw_spin_unlock_irqrestore(&rnp->lock, flags);
929 935
930 set_need_resched(); /* kick ourselves to get things going. */ 936 /*
937 * Attempt to revive the RCU machinery by forcing a context switch.
938 *
939 * A context switch would normally allow the RCU state machine to make
940 * progress, and it could be that we're stuck in kernel space without context
941 * switches for an entirely unreasonable amount of time.
942 */
943 resched_cpu(smp_processor_id());
931} 944}
932 945
933static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 946static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ac63c9a995a..0c3feebcf112 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
513 * might also involve a cross-CPU call to trigger the scheduler on 513 * might also involve a cross-CPU call to trigger the scheduler on
514 * the target CPU. 514 * the target CPU.
515 */ 515 */
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p) 516void resched_task(struct task_struct *p)
518{ 517{
519 int cpu; 518 int cpu;
520 519
521 assert_raw_spin_locked(&task_rq(p)->lock); 520 lockdep_assert_held(&task_rq(p)->lock);
522 521
523 if (test_tsk_need_resched(p)) 522 if (test_tsk_need_resched(p))
524 return; 523 return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
526 set_tsk_need_resched(p); 525 set_tsk_need_resched(p);
527 526
528 cpu = task_cpu(p); 527 cpu = task_cpu(p);
529 if (cpu == smp_processor_id()) 528 if (cpu == smp_processor_id()) {
529 set_preempt_need_resched();
530 return; 530 return;
531 }
531 532
532 /* NEED_RESCHED must be visible before we test polling */ 533 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 534 smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 547 raw_spin_unlock_irqrestore(&rq->lock, flags);
547} 548}
548 549
550#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON 551#ifdef CONFIG_NO_HZ_COMMON
550/* 552/*
551 * In the semi idle case, use the nearest busy cpu for migrating timers 553 * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
693 } 695 }
694} 696}
695 697
696#else /* !CONFIG_SMP */
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif /* CONFIG_SMP */ 698#endif /* CONFIG_SMP */
703 699
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 700#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 763static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{ 764{
769 update_rq_clock(rq); 765 update_rq_clock(rq);
770 sched_info_queued(p); 766 sched_info_queued(rq, p);
771 p->sched_class->enqueue_task(rq, p, flags); 767 p->sched_class->enqueue_task(rq, p, flags);
772} 768}
773 769
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 770static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{ 771{
776 update_rq_clock(rq); 772 update_rq_clock(rq);
777 sched_info_dequeued(p); 773 sched_info_dequeued(rq, p);
778 p->sched_class->dequeue_task(rq, p, flags); 774 p->sched_class->dequeue_task(rq, p, flags);
779} 775}
780 776
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
987 * ttwu() will sort out the placement. 983 * ttwu() will sort out the placement.
988 */ 984 */
989 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
990 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 986 !(task_preempt_count(p) & PREEMPT_ACTIVE));
991 987
992#ifdef CONFIG_LOCKDEP 988#ifdef CONFIG_LOCKDEP
993 /* 989 /*
@@ -1017,6 +1013,102 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1017 __set_task_cpu(p, new_cpu); 1013 __set_task_cpu(p, new_cpu);
1018} 1014}
1019 1015
1016static void __migrate_swap_task(struct task_struct *p, int cpu)
1017{
1018 if (p->on_rq) {
1019 struct rq *src_rq, *dst_rq;
1020
1021 src_rq = task_rq(p);
1022 dst_rq = cpu_rq(cpu);
1023
1024 deactivate_task(src_rq, p, 0);
1025 set_task_cpu(p, cpu);
1026 activate_task(dst_rq, p, 0);
1027 check_preempt_curr(dst_rq, p, 0);
1028 } else {
1029 /*
1030 * Task isn't running anymore; make it appear like we migrated
1031 * it before it went to sleep. This means on wakeup we make the
1032 * previous cpu our target instead of where it really is.
1033 */
1034 p->wake_cpu = cpu;
1035 }
1036}
1037
1038struct migration_swap_arg {
1039 struct task_struct *src_task, *dst_task;
1040 int src_cpu, dst_cpu;
1041};
1042
1043static int migrate_swap_stop(void *data)
1044{
1045 struct migration_swap_arg *arg = data;
1046 struct rq *src_rq, *dst_rq;
1047 int ret = -EAGAIN;
1048
1049 src_rq = cpu_rq(arg->src_cpu);
1050 dst_rq = cpu_rq(arg->dst_cpu);
1051
1052 double_rq_lock(src_rq, dst_rq);
1053 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1054 goto unlock;
1055
1056 if (task_cpu(arg->src_task) != arg->src_cpu)
1057 goto unlock;
1058
1059 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1060 goto unlock;
1061
1062 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1063 goto unlock;
1064
1065 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1066 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1067
1068 ret = 0;
1069
1070unlock:
1071 double_rq_unlock(src_rq, dst_rq);
1072
1073 return ret;
1074}
1075
1076/*
1077 * Cross migrate two tasks
1078 */
1079int migrate_swap(struct task_struct *cur, struct task_struct *p)
1080{
1081 struct migration_swap_arg arg;
1082 int ret = -EINVAL;
1083
1084 get_online_cpus();
1085
1086 arg = (struct migration_swap_arg){
1087 .src_task = cur,
1088 .src_cpu = task_cpu(cur),
1089 .dst_task = p,
1090 .dst_cpu = task_cpu(p),
1091 };
1092
1093 if (arg.src_cpu == arg.dst_cpu)
1094 goto out;
1095
1096 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1097 goto out;
1098
1099 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1100 goto out;
1101
1102 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1103 goto out;
1104
1105 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1106
1107out:
1108 put_online_cpus();
1109 return ret;
1110}
1111
1020struct migration_arg { 1112struct migration_arg {
1021 struct task_struct *task; 1113 struct task_struct *task;
1022 int dest_cpu; 1114 int dest_cpu;
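migrate_swap_stop() above takes both runqueue locks with double_rq_lock() and only then re-checks that both tasks are still on the expected CPUs and allowed on each other's CPU before swapping them. The userspace sketch below models that "lock both sides in a stable order, re-validate, then act" pattern with pthread mutexes; the cpu_slot structure, its fields and the address-based ordering rule are inventions of the example, not the kernel's own locking helpers.

    /* pair_lock_model.c - take two locks in a stable order to avoid deadlock,
     * then re-validate the state that was sampled before locking.
     */
    #include <pthread.h>
    #include <stdio.h>

    struct cpu_slot {
            pthread_mutex_t lock;
            int curr_task;          /* "who is running here" */
    };

    static void lock_pair(struct cpu_slot *a, struct cpu_slot *b)
    {
            if (a > b) {            /* stable order: lower address first */
                    struct cpu_slot *t = a; a = b; b = t;
            }
            pthread_mutex_lock(&a->lock);
            pthread_mutex_lock(&b->lock);
    }

    static void unlock_pair(struct cpu_slot *a, struct cpu_slot *b)
    {
            pthread_mutex_unlock(&a->lock);
            pthread_mutex_unlock(&b->lock);
    }

    /* Swap the two "tasks" only if they are still where the caller thought. */
    static int try_swap(struct cpu_slot *src, struct cpu_slot *dst,
                        int src_task, int dst_task)
    {
            int ret = -1;           /* stands in for -EAGAIN */

            lock_pair(src, dst);
            if (src->curr_task == src_task && dst->curr_task == dst_task) {
                    src->curr_task = dst_task;
                    dst->curr_task = src_task;
                    ret = 0;
            }
            unlock_pair(src, dst);
            return ret;
    }

    int main(void)
    {
            struct cpu_slot c0 = { .lock = PTHREAD_MUTEX_INITIALIZER, .curr_task = 10 };
            struct cpu_slot c1 = { .lock = PTHREAD_MUTEX_INITIALIZER, .curr_task = 20 };

            printf("swap: %s\n", try_swap(&c0, &c1, 10, 20) ? "raced" : "done");
            printf("cpu0 runs %d, cpu1 runs %d\n", c0.curr_task, c1.curr_task);
            return 0;
    }

If either re-check fails the sketch returns -1, mirroring the -EAGAIN path in the hunk: the caller observes that the world changed between sampling and locking and gives up on this attempt.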
@@ -1236,9 +1328,9 @@ out:
1236 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1328 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1237 */ 1329 */
1238static inline 1330static inline
1239int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1331int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1240{ 1332{
1241 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1333 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1242 1334
1243 /* 1335 /*
1244 * In order not to call set_task_cpu() on a blocking task we need 1336 * In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1422,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1330 1422
1331 if (rq->idle_stamp) { 1423 if (rq->idle_stamp) {
1332 u64 delta = rq_clock(rq) - rq->idle_stamp; 1424 u64 delta = rq_clock(rq) - rq->idle_stamp;
1333 u64 max = 2*sysctl_sched_migration_cost; 1425 u64 max = 2*rq->max_idle_balance_cost;
1334 1426
1335 if (delta > max) 1427 update_avg(&rq->avg_idle, delta);
1428
1429 if (rq->avg_idle > max)
1336 rq->avg_idle = max; 1430 rq->avg_idle = max;
1337 else 1431
1338 update_avg(&rq->avg_idle, delta);
1339 rq->idle_stamp = 0; 1432 rq->idle_stamp = 0;
1340 } 1433 }
1341#endif 1434#endif
@@ -1396,6 +1489,14 @@ static void sched_ttwu_pending(void)
1396 1489
1397void scheduler_ipi(void) 1490void scheduler_ipi(void)
1398{ 1491{
1492 /*
1493 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1494 * TIF_NEED_RESCHED remotely (for the first time) will also send
1495 * this IPI.
1496 */
1497 if (tif_need_resched())
1498 set_preempt_need_resched();
1499
1399 if (llist_empty(&this_rq()->wake_list) 1500 if (llist_empty(&this_rq()->wake_list)
1400 && !tick_nohz_full_cpu(smp_processor_id()) 1501 && !tick_nohz_full_cpu(smp_processor_id())
1401 && !got_nohz_idle_kick()) 1502 && !got_nohz_idle_kick())
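The new lines in scheduler_ipi() fold a remotely-set TIF_NEED_RESCHED into the preempt count via set_preempt_need_resched(), which is what lets the preempt_enable() fast path elsewhere in this series test a single word. The sketch below is a simplified standalone model of that folding idea; the inverted "no resched needed" bit, the constant and the helper bodies are illustrative only and are not the arch code from asm/preempt.h.

    /* folded_count_model.c - need-resched folded into the preempt-count word,
     * so "may we preempt right now?" is one test against zero.
     */
    #include <stdio.h>

    #define PREEMPT_NEED_RESCHED  0x80000000u  /* bit SET means: no resched needed */

    static unsigned int preempt_count = PREEMPT_NEED_RESCHED;  /* count 0, no request */

    static void preempt_disable(void) { preempt_count += 1; }
    static void preempt_enable(void)  { preempt_count -= 1; }

    static void set_preempt_need_resched(void)   { preempt_count &= ~PREEMPT_NEED_RESCHED; }
    static void clear_preempt_need_resched(void) { preempt_count |=  PREEMPT_NEED_RESCHED; }

    /* One comparison answers "count is zero AND a reschedule was requested". */
    static int should_resched(void) { return preempt_count == 0; }

    int main(void)
    {
            preempt_disable();
            set_preempt_need_resched();            /* e.g. from scheduler_ipi() */
            printf("inside critical section: should_resched=%d\n", should_resched());

            preempt_enable();
            printf("after preempt_enable:    should_resched=%d\n", should_resched());

            clear_preempt_need_resched();          /* __schedule() ran */
            printf("after reschedule:        should_resched=%d\n", should_resched());
            return 0;
    }

With the flag folded in, should_resched() collapses to one compare against zero: a non-zero word means either preemption is disabled or no reschedule was requested.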
@@ -1513,7 +1614,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1513 if (p->sched_class->task_waking) 1614 if (p->sched_class->task_waking)
1514 p->sched_class->task_waking(p); 1615 p->sched_class->task_waking(p);
1515 1616
1516 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1617 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1517 if (task_cpu(p) != cpu) { 1618 if (task_cpu(p) != cpu) {
1518 wake_flags |= WF_MIGRATED; 1619 wake_flags |= WF_MIGRATED;
1519 set_task_cpu(p, cpu); 1620 set_task_cpu(p, cpu);
@@ -1595,7 +1696,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1595 * 1696 *
1596 * __sched_fork() is basic setup used by init_idle() too: 1697 * __sched_fork() is basic setup used by init_idle() too:
1597 */ 1698 */
1598static void __sched_fork(struct task_struct *p) 1699static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1599{ 1700{
1600 p->on_rq = 0; 1701 p->on_rq = 0;
1601 1702
@@ -1619,16 +1720,24 @@ static void __sched_fork(struct task_struct *p)
1619 1720
1620#ifdef CONFIG_NUMA_BALANCING 1721#ifdef CONFIG_NUMA_BALANCING
1621 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1722 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1622 p->mm->numa_next_scan = jiffies; 1723 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1623 p->mm->numa_next_reset = jiffies;
1624 p->mm->numa_scan_seq = 0; 1724 p->mm->numa_scan_seq = 0;
1625 } 1725 }
1626 1726
1727 if (clone_flags & CLONE_VM)
1728 p->numa_preferred_nid = current->numa_preferred_nid;
1729 else
1730 p->numa_preferred_nid = -1;
1731
1627 p->node_stamp = 0ULL; 1732 p->node_stamp = 0ULL;
1628 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1733 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1629 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1630 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1734 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1631 p->numa_work.next = &p->numa_work; 1735 p->numa_work.next = &p->numa_work;
1736 p->numa_faults = NULL;
1737 p->numa_faults_buffer = NULL;
1738
1739 INIT_LIST_HEAD(&p->numa_entry);
1740 p->numa_group = NULL;
1632#endif /* CONFIG_NUMA_BALANCING */ 1741#endif /* CONFIG_NUMA_BALANCING */
1633} 1742}
1634 1743
@@ -1654,12 +1763,12 @@ void set_numabalancing_state(bool enabled)
1654/* 1763/*
1655 * fork()/clone()-time setup: 1764 * fork()/clone()-time setup:
1656 */ 1765 */
1657void sched_fork(struct task_struct *p) 1766void sched_fork(unsigned long clone_flags, struct task_struct *p)
1658{ 1767{
1659 unsigned long flags; 1768 unsigned long flags;
1660 int cpu = get_cpu(); 1769 int cpu = get_cpu();
1661 1770
1662 __sched_fork(p); 1771 __sched_fork(clone_flags, p);
1663 /* 1772 /*
1664 * We mark the process as running here. This guarantees that 1773 * We mark the process as running here. This guarantees that
1665 * nobody will actually run it, and a signal or other external 1774 * nobody will actually run it, and a signal or other external
@@ -1717,10 +1826,7 @@ void sched_fork(struct task_struct *p)
1717#if defined(CONFIG_SMP) 1826#if defined(CONFIG_SMP)
1718 p->on_cpu = 0; 1827 p->on_cpu = 0;
1719#endif 1828#endif
1720#ifdef CONFIG_PREEMPT_COUNT 1829 init_task_preempt_count(p);
1721 /* Want to start with kernel preemption disabled. */
1722 task_thread_info(p)->preempt_count = 1;
1723#endif
1724#ifdef CONFIG_SMP 1830#ifdef CONFIG_SMP
1725 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1831 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1726#endif 1832#endif
@@ -1747,7 +1853,7 @@ void wake_up_new_task(struct task_struct *p)
1747 * - cpus_allowed can change in the fork path 1853 * - cpus_allowed can change in the fork path
1748 * - any previously selected cpu might disappear through hotplug 1854 * - any previously selected cpu might disappear through hotplug
1749 */ 1855 */
1750 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1856 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
1751#endif 1857#endif
1752 1858
1753 /* Initialize new task's runnable average */ 1859 /* Initialize new task's runnable average */
@@ -1838,7 +1944,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1838 struct task_struct *next) 1944 struct task_struct *next)
1839{ 1945{
1840 trace_sched_switch(prev, next); 1946 trace_sched_switch(prev, next);
1841 sched_info_switch(prev, next); 1947 sched_info_switch(rq, prev, next);
1842 perf_event_task_sched_out(prev, next); 1948 perf_event_task_sched_out(prev, next);
1843 fire_sched_out_preempt_notifiers(prev, next); 1949 fire_sched_out_preempt_notifiers(prev, next);
1844 prepare_lock_switch(rq, next); 1950 prepare_lock_switch(rq, next);
@@ -1890,6 +1996,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1890 if (mm) 1996 if (mm)
1891 mmdrop(mm); 1997 mmdrop(mm);
1892 if (unlikely(prev_state == TASK_DEAD)) { 1998 if (unlikely(prev_state == TASK_DEAD)) {
1999 task_numa_free(prev);
2000
1893 /* 2001 /*
1894 * Remove function-return probe instances associated with this 2002 * Remove function-return probe instances associated with this
1895 * task and put them back on the free list. 2003 * task and put them back on the free list.
@@ -2073,7 +2181,7 @@ void sched_exec(void)
2073 int dest_cpu; 2181 int dest_cpu;
2074 2182
2075 raw_spin_lock_irqsave(&p->pi_lock, flags); 2183 raw_spin_lock_irqsave(&p->pi_lock, flags);
2076 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2184 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2077 if (dest_cpu == smp_processor_id()) 2185 if (dest_cpu == smp_processor_id())
2078 goto unlock; 2186 goto unlock;
2079 2187
@@ -2215,7 +2323,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2215#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2323#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2216 defined(CONFIG_PREEMPT_TRACER)) 2324 defined(CONFIG_PREEMPT_TRACER))
2217 2325
2218void __kprobes add_preempt_count(int val) 2326void __kprobes preempt_count_add(int val)
2219{ 2327{
2220#ifdef CONFIG_DEBUG_PREEMPT 2328#ifdef CONFIG_DEBUG_PREEMPT
2221 /* 2329 /*
@@ -2224,7 +2332,7 @@ void __kprobes add_preempt_count(int val)
2224 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2332 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2225 return; 2333 return;
2226#endif 2334#endif
2227 preempt_count() += val; 2335 __preempt_count_add(val);
2228#ifdef CONFIG_DEBUG_PREEMPT 2336#ifdef CONFIG_DEBUG_PREEMPT
2229 /* 2337 /*
2230 * Spinlock count overflowing soon? 2338 * Spinlock count overflowing soon?
@@ -2235,9 +2343,9 @@ void __kprobes add_preempt_count(int val)
2235 if (preempt_count() == val) 2343 if (preempt_count() == val)
2236 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2344 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2237} 2345}
2238EXPORT_SYMBOL(add_preempt_count); 2346EXPORT_SYMBOL(preempt_count_add);
2239 2347
2240void __kprobes sub_preempt_count(int val) 2348void __kprobes preempt_count_sub(int val)
2241{ 2349{
2242#ifdef CONFIG_DEBUG_PREEMPT 2350#ifdef CONFIG_DEBUG_PREEMPT
2243 /* 2351 /*
@@ -2255,9 +2363,9 @@ void __kprobes sub_preempt_count(int val)
2255 2363
2256 if (preempt_count() == val) 2364 if (preempt_count() == val)
2257 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2365 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2258 preempt_count() -= val; 2366 __preempt_count_sub(val);
2259} 2367}
2260EXPORT_SYMBOL(sub_preempt_count); 2368EXPORT_SYMBOL(preempt_count_sub);
2261 2369
2262#endif 2370#endif
2263 2371
@@ -2430,6 +2538,7 @@ need_resched:
2430 put_prev_task(rq, prev); 2538 put_prev_task(rq, prev);
2431 next = pick_next_task(rq); 2539 next = pick_next_task(rq);
2432 clear_tsk_need_resched(prev); 2540 clear_tsk_need_resched(prev);
2541 clear_preempt_need_resched();
2433 rq->skip_clock_update = 0; 2542 rq->skip_clock_update = 0;
2434 2543
2435 if (likely(prev != next)) { 2544 if (likely(prev != next)) {
@@ -2520,9 +2629,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
2520 return; 2629 return;
2521 2630
2522 do { 2631 do {
2523 add_preempt_count_notrace(PREEMPT_ACTIVE); 2632 __preempt_count_add(PREEMPT_ACTIVE);
2524 __schedule(); 2633 __schedule();
2525 sub_preempt_count_notrace(PREEMPT_ACTIVE); 2634 __preempt_count_sub(PREEMPT_ACTIVE);
2526 2635
2527 /* 2636 /*
2528 * Check again in case we missed a preemption opportunity 2637 * Check again in case we missed a preemption opportunity
@@ -2541,20 +2650,19 @@ EXPORT_SYMBOL(preempt_schedule);
2541 */ 2650 */
2542asmlinkage void __sched preempt_schedule_irq(void) 2651asmlinkage void __sched preempt_schedule_irq(void)
2543{ 2652{
2544 struct thread_info *ti = current_thread_info();
2545 enum ctx_state prev_state; 2653 enum ctx_state prev_state;
2546 2654
2547 /* Catch callers which need to be fixed */ 2655 /* Catch callers which need to be fixed */
2548 BUG_ON(ti->preempt_count || !irqs_disabled()); 2656 BUG_ON(preempt_count() || !irqs_disabled());
2549 2657
2550 prev_state = exception_enter(); 2658 prev_state = exception_enter();
2551 2659
2552 do { 2660 do {
2553 add_preempt_count(PREEMPT_ACTIVE); 2661 __preempt_count_add(PREEMPT_ACTIVE);
2554 local_irq_enable(); 2662 local_irq_enable();
2555 __schedule(); 2663 __schedule();
2556 local_irq_disable(); 2664 local_irq_disable();
2557 sub_preempt_count(PREEMPT_ACTIVE); 2665 __preempt_count_sub(PREEMPT_ACTIVE);
2558 2666
2559 /* 2667 /*
2560 * Check again in case we missed a preemption opportunity 2668 * Check again in case we missed a preemption opportunity
@@ -3794,16 +3902,11 @@ SYSCALL_DEFINE0(sched_yield)
3794 return 0; 3902 return 0;
3795} 3903}
3796 3904
3797static inline int should_resched(void)
3798{
3799 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3800}
3801
3802static void __cond_resched(void) 3905static void __cond_resched(void)
3803{ 3906{
3804 add_preempt_count(PREEMPT_ACTIVE); 3907 __preempt_count_add(PREEMPT_ACTIVE);
3805 __schedule(); 3908 __schedule();
3806 sub_preempt_count(PREEMPT_ACTIVE); 3909 __preempt_count_sub(PREEMPT_ACTIVE);
3807} 3910}
3808 3911
3809int __sched _cond_resched(void) 3912int __sched _cond_resched(void)
@@ -4186,7 +4289,7 @@ void init_idle(struct task_struct *idle, int cpu)
4186 4289
4187 raw_spin_lock_irqsave(&rq->lock, flags); 4290 raw_spin_lock_irqsave(&rq->lock, flags);
4188 4291
4189 __sched_fork(idle); 4292 __sched_fork(0, idle);
4190 idle->state = TASK_RUNNING; 4293 idle->state = TASK_RUNNING;
4191 idle->se.exec_start = sched_clock(); 4294 idle->se.exec_start = sched_clock();
4192 4295
@@ -4212,7 +4315,7 @@ void init_idle(struct task_struct *idle, int cpu)
4212 raw_spin_unlock_irqrestore(&rq->lock, flags); 4315 raw_spin_unlock_irqrestore(&rq->lock, flags);
4213 4316
4214 /* Set the preempt count _outside_ the spinlocks! */ 4317 /* Set the preempt count _outside_ the spinlocks! */
4215 task_thread_info(idle)->preempt_count = 0; 4318 init_idle_preempt_count(idle, cpu);
4216 4319
4217 /* 4320 /*
4218 * The idle tasks have their own, simple scheduling class: 4321 * The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4449,53 @@ fail:
4346 return ret; 4449 return ret;
4347} 4450}
4348 4451
4452#ifdef CONFIG_NUMA_BALANCING
4453/* Migrate current task p to target_cpu */
4454int migrate_task_to(struct task_struct *p, int target_cpu)
4455{
4456 struct migration_arg arg = { p, target_cpu };
4457 int curr_cpu = task_cpu(p);
4458
4459 if (curr_cpu == target_cpu)
4460 return 0;
4461
4462 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4463 return -EINVAL;
4464
4465 /* TODO: This is not properly updating schedstats */
4466
4467 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4468}
4469
4470/*
4471 * Requeue a task on a given node and accurately track the number of NUMA
4472 * tasks on the runqueues
4473 */
4474void sched_setnuma(struct task_struct *p, int nid)
4475{
4476 struct rq *rq;
4477 unsigned long flags;
4478 bool on_rq, running;
4479
4480 rq = task_rq_lock(p, &flags);
4481 on_rq = p->on_rq;
4482 running = task_current(rq, p);
4483
4484 if (on_rq)
4485 dequeue_task(rq, p, 0);
4486 if (running)
4487 p->sched_class->put_prev_task(rq, p);
4488
4489 p->numa_preferred_nid = nid;
4490
4491 if (running)
4492 p->sched_class->set_curr_task(rq);
4493 if (on_rq)
4494 enqueue_task(rq, p, 0);
4495 task_rq_unlock(rq, p, &flags);
4496}
4497#endif
4498
4349/* 4499/*
4350 * migration_cpu_stop - this will be executed by a highprio stopper thread 4500 * migration_cpu_stop - this will be executed by a highprio stopper thread
4351 * and performs thread migration by bumping thread off CPU then 4501 * and performs thread migration by bumping thread off CPU then
@@ -5119,6 +5269,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5119DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5269DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5120DEFINE_PER_CPU(int, sd_llc_size); 5270DEFINE_PER_CPU(int, sd_llc_size);
5121DEFINE_PER_CPU(int, sd_llc_id); 5271DEFINE_PER_CPU(int, sd_llc_id);
5272DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5122 5273
5123static void update_top_cache_domain(int cpu) 5274static void update_top_cache_domain(int cpu)
5124{ 5275{
@@ -5135,6 +5286,9 @@ static void update_top_cache_domain(int cpu)
5135 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5286 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5136 per_cpu(sd_llc_size, cpu) = size; 5287 per_cpu(sd_llc_size, cpu) = size;
5137 per_cpu(sd_llc_id, cpu) = id; 5288 per_cpu(sd_llc_id, cpu) = id;
5289
5290 sd = lowest_flag_domain(cpu, SD_NUMA);
5291 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
5138} 5292}
5139 5293
5140/* 5294/*
@@ -5654,6 +5808,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5654 | 0*SD_SHARE_PKG_RESOURCES 5808 | 0*SD_SHARE_PKG_RESOURCES
5655 | 1*SD_SERIALIZE 5809 | 1*SD_SERIALIZE
5656 | 0*SD_PREFER_SIBLING 5810 | 0*SD_PREFER_SIBLING
5811 | 1*SD_NUMA
5657 | sd_local_flags(level) 5812 | sd_local_flags(level)
5658 , 5813 ,
5659 .last_balance = jiffies, 5814 .last_balance = jiffies,
@@ -6505,6 +6660,7 @@ void __init sched_init(void)
6505 rq->online = 0; 6660 rq->online = 0;
6506 rq->idle_stamp = 0; 6661 rq->idle_stamp = 0;
6507 rq->avg_idle = 2*sysctl_sched_migration_cost; 6662 rq->avg_idle = 2*sysctl_sched_migration_cost;
6663 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6508 6664
6509 INIT_LIST_HEAD(&rq->cfs_tasks); 6665 INIT_LIST_HEAD(&rq->cfs_tasks);
6510 6666
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 196559994f7c..e6ba5e31c7ca 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18#include <linux/mempolicy.h>
18 19
19#include "sched.h" 20#include "sched.h"
20 21
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
137 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 138 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
138 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
139#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
143#endif
140#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
141 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
142#endif 146#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 read_lock_irqsave(&tasklist_lock, flags); 163 read_lock_irqsave(&tasklist_lock, flags);
160 164
161 do_each_thread(g, p) { 165 do_each_thread(g, p) {
162 if (!p->on_rq || task_cpu(p) != rq_cpu) 166 if (task_cpu(p) != rq_cpu)
163 continue; 167 continue;
164 168
165 print_task(m, rq, p); 169 print_task(m, rq, p);
@@ -345,7 +349,7 @@ static void sched_debug_header(struct seq_file *m)
345 cpu_clk = local_clock(); 349 cpu_clk = local_clock();
346 local_irq_restore(flags); 350 local_irq_restore(flags);
347 351
348 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", 352 SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
349 init_utsname()->release, 353 init_utsname()->release,
350 (int)strcspn(init_utsname()->version, " "), 354 (int)strcspn(init_utsname()->version, " "),
351 init_utsname()->version); 355 init_utsname()->version);
@@ -488,6 +492,56 @@ static int __init init_sched_debug_procfs(void)
488 492
489__initcall(init_sched_debug_procfs); 493__initcall(init_sched_debug_procfs);
490 494
495#define __P(F) \
496 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
497#define P(F) \
498 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
499#define __PN(F) \
500 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
501#define PN(F) \
502 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
503
504
505static void sched_show_numa(struct task_struct *p, struct seq_file *m)
506{
507#ifdef CONFIG_NUMA_BALANCING
508 struct mempolicy *pol;
509 int node, i;
510
511 if (p->mm)
512 P(mm->numa_scan_seq);
513
514 task_lock(p);
515 pol = p->mempolicy;
516 if (pol && !(pol->flags & MPOL_F_MORON))
517 pol = NULL;
518 mpol_get(pol);
519 task_unlock(p);
520
521 SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
522
523 for_each_online_node(node) {
524 for (i = 0; i < 2; i++) {
525 unsigned long nr_faults = -1;
526 int cpu_current, home_node;
527
528 if (p->numa_faults)
529 nr_faults = p->numa_faults[2*node + i];
530
531 cpu_current = !i ? (task_node(p) == node) :
532 (pol && node_isset(node, pol->v.nodes));
533
534 home_node = (p->numa_preferred_nid == node);
535
536 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
537 i, node, cpu_current, home_node, nr_faults);
538 }
539 }
540
541 mpol_put(pol);
542#endif
543}
544
491void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 545void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
492{ 546{
493 unsigned long nr_switches; 547 unsigned long nr_switches;
@@ -591,6 +645,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
591 SEQ_printf(m, "%-45s:%21Ld\n", 645 SEQ_printf(m, "%-45s:%21Ld\n",
592 "clock-delta", (long long)(t1-t0)); 646 "clock-delta", (long long)(t1-t0));
593 } 647 }
648
649 sched_show_numa(p, m);
594} 650}
595 651
596void proc_sched_set_task(struct task_struct *p) 652void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c70201fbc61..803e343d7c89 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
681} 681}
682 682
683#ifdef CONFIG_SMP 683#ifdef CONFIG_SMP
684static unsigned long task_h_load(struct task_struct *p);
685
684static inline void __update_task_entity_contrib(struct sched_entity *se); 686static inline void __update_task_entity_contrib(struct sched_entity *se);
685 687
686/* Give new task start runnable values to heavy its load in infant time */ 688/* Give new task start runnable values to heavy its load in infant time */
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
818 820
819#ifdef CONFIG_NUMA_BALANCING 821#ifdef CONFIG_NUMA_BALANCING
820/* 822/*
821 * numa task sample period in ms 823 * Approximate time to scan a full NUMA task in ms. The task scan period is
824 * calculated based on the task's virtual memory size and
825 * numa_balancing_scan_size.
822 */ 826 */
823unsigned int sysctl_numa_balancing_scan_period_min = 100; 827unsigned int sysctl_numa_balancing_scan_period_min = 1000;
824unsigned int sysctl_numa_balancing_scan_period_max = 100*50; 828unsigned int sysctl_numa_balancing_scan_period_max = 60000;
825unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 829
827/* Portion of address space to scan in MB */ 830/* Portion of address space to scan in MB */
828unsigned int sysctl_numa_balancing_scan_size = 256; 831unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +833,819 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
830/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 833/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831unsigned int sysctl_numa_balancing_scan_delay = 1000; 834unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 835
833static void task_numa_placement(struct task_struct *p) 836/*
837 * After skipping a page migration on a shared page, skip N more numa page
838 * migrations unconditionally. This reduces the number of NUMA migrations
839 * in shared memory workloads, and has the effect of pulling tasks towards
840 * where their memory lives, over pulling the memory towards the task.
841 */
842unsigned int sysctl_numa_balancing_migrate_deferred = 16;
843
844static unsigned int task_nr_scan_windows(struct task_struct *p)
845{
846 unsigned long rss = 0;
847 unsigned long nr_scan_pages;
848
849 /*
850 * Calculations are based on RSS, as non-present and empty pages are skipped
851 * by the PTE scanner and NUMA hinting faults should be trapped based
852 * on resident pages.
853 */
854 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
855 rss = get_mm_rss(p->mm);
856 if (!rss)
857 rss = nr_scan_pages;
858
859 rss = round_up(rss, nr_scan_pages);
860 return rss / nr_scan_pages;
861}
862
863/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
864#define MAX_SCAN_WINDOW 2560
865
866static unsigned int task_scan_min(struct task_struct *p)
867{
868 unsigned int scan, floor;
869 unsigned int windows = 1;
870
871 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
872 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
873 floor = 1000 / windows;
874
875 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
876 return max_t(unsigned int, floor, scan);
877}
878
879static unsigned int task_scan_max(struct task_struct *p)
880{
881 unsigned int smin = task_scan_min(p);
882 unsigned int smax;
883
884 /* Watch for min being lower than max due to floor calculations */
885 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
886 return max(smin, smax);
887}
888
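task_nr_scan_windows() and task_scan_min() above turn an RSS figure into a per-task minimum scan period. A worked example of that arithmetic is sketched below as a standalone program, using the defaults visible in the hunk (256MB scan size, 1000ms minimum period, MAX_SCAN_WINDOW of 2560MB/sec); the 1GB RSS and the 4K page size are invented for the example.

    /* scan_window_model.c - worked example of the scan-period floor above. */
    #include <stdio.h>

    #define PAGE_SHIFT        12      /* assume 4K pages for the example */
    #define MAX_SCAN_WINDOW   2560    /* MB/sec, as in the patch */

    static unsigned int scan_size_mb = 256;
    static unsigned int scan_period_min_ms = 1000;

    static unsigned long nr_scan_windows(unsigned long rss_pages)
    {
            unsigned long nr_scan_pages = (unsigned long)scan_size_mb << (20 - PAGE_SHIFT);

            if (!rss_pages)
                    rss_pages = nr_scan_pages;
            /* round up to a whole number of scan windows */
            rss_pages = (rss_pages + nr_scan_pages - 1) / nr_scan_pages * nr_scan_pages;
            return rss_pages / nr_scan_pages;
    }

    static unsigned int scan_min_ms(unsigned long rss_pages)
    {
            unsigned int windows = 1, floor, scan;

            if (scan_size_mb < MAX_SCAN_WINDOW)
                    windows = MAX_SCAN_WINDOW / scan_size_mb;
            floor = 1000 / windows;                 /* 100ms with the defaults */

            scan = scan_period_min_ms / nr_scan_windows(rss_pages);
            return scan > floor ? scan : floor;
    }

    int main(void)
    {
            unsigned long rss = 1UL << 18;  /* 262144 pages = 1GB RSS, invented */

            printf("scan windows: %lu, minimum scan period: %ums\n",
                   nr_scan_windows(rss), scan_min_ms(rss));
            return 0;
    }

For the invented 1GB task this prints 4 scan windows and a 250ms minimum period; the 100ms floor from MAX_SCAN_WINDOW only takes over once the period would otherwise drop below it, roughly 2.5GB resident with these defaults.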
889/*
890 * Once a preferred node is selected the scheduler balancer will prefer moving
891 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
892 * scans. This will give the process the chance to accumulate more faults on
893 * the preferred node but still allow the scheduler to move the task again if
894 * the nodes CPUs are overloaded.
895 */
896unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
897
898static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
899{
900 rq->nr_numa_running += (p->numa_preferred_nid != -1);
901 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
902}
903
904static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
905{
906 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
907 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
908}
909
910struct numa_group {
911 atomic_t refcount;
912
913 spinlock_t lock; /* nr_tasks, tasks */
914 int nr_tasks;
915 pid_t gid;
916 struct list_head task_list;
917
918 struct rcu_head rcu;
919 unsigned long total_faults;
920 unsigned long faults[0];
921};
922
923pid_t task_numa_group_id(struct task_struct *p)
924{
925 return p->numa_group ? p->numa_group->gid : 0;
926}
927
928static inline int task_faults_idx(int nid, int priv)
929{
930 return 2 * nid + priv;
931}
932
933static inline unsigned long task_faults(struct task_struct *p, int nid)
934{
935 if (!p->numa_faults)
936 return 0;
937
938 return p->numa_faults[task_faults_idx(nid, 0)] +
939 p->numa_faults[task_faults_idx(nid, 1)];
940}
941
942static inline unsigned long group_faults(struct task_struct *p, int nid)
943{
944 if (!p->numa_group)
945 return 0;
946
947 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
948}
949
950/*
951 * These return the fraction of accesses done by a particular task, or
952 * task group, on a particular numa node. The group weight is given a
953 * larger multiplier, in order to group tasks together that are almost
954 * evenly spread out between numa nodes.
955 */
956static inline unsigned long task_weight(struct task_struct *p, int nid)
957{
958 unsigned long total_faults;
959
960 if (!p->numa_faults)
961 return 0;
962
963 total_faults = p->total_numa_faults;
964
965 if (!total_faults)
966 return 0;
967
968 return 1000 * task_faults(p, nid) / total_faults;
969}
970
971static inline unsigned long group_weight(struct task_struct *p, int nid)
972{
973 if (!p->numa_group || !p->numa_group->total_faults)
974 return 0;
975
976 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
977}
978
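task_weight() and group_weight() above reduce the per-node fault counts to a fraction scaled to 1000. The standalone example below works through that calculation for an invented two-node fault table, summing private and shared faults per node the way task_faults() does.

    /* fault_weight_model.c - worked example of the 1000-scaled fault fractions. */
    #include <stdio.h>

    #define NR_NODES 2

    /* faults[node][0] = private, faults[node][1] = shared -- invented numbers */
    static unsigned long faults[NR_NODES][2] = { { 60, 20 }, { 15, 5 } };

    static unsigned long node_faults(int nid)
    {
            return faults[nid][0] + faults[nid][1];
    }

    int main(void)
    {
            unsigned long total = 0;

            for (int nid = 0; nid < NR_NODES; nid++)
                    total += node_faults(nid);

            for (int nid = 0; nid < NR_NODES; nid++)
                    printf("node %d: weight = %lu/1000\n",
                           nid, 1000 * node_faults(nid) / total);
            return 0;
    }

With 80 of the 100 recorded faults on node 0, the task's weight there is 800/1000 versus 200/1000 on node 1, which is the kind of differential the placement code below compares between source and destination nodes.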
979static unsigned long weighted_cpuload(const int cpu);
980static unsigned long source_load(int cpu, int type);
981static unsigned long target_load(int cpu, int type);
982static unsigned long power_of(int cpu);
983static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
984
985/* Cached statistics for all CPUs within a node */
986struct numa_stats {
987 unsigned long nr_running;
988 unsigned long load;
989
990 /* Total compute capacity of CPUs on a node */
991 unsigned long power;
992
993 /* Approximate capacity in terms of runnable tasks on a node */
994 unsigned long capacity;
995 int has_capacity;
996};
997
998/*
999 * XXX borrowed from update_sg_lb_stats
1000 */
1001static void update_numa_stats(struct numa_stats *ns, int nid)
1002{
1003 int cpu;
1004
1005 memset(ns, 0, sizeof(*ns));
1006 for_each_cpu(cpu, cpumask_of_node(nid)) {
1007 struct rq *rq = cpu_rq(cpu);
1008
1009 ns->nr_running += rq->nr_running;
1010 ns->load += weighted_cpuload(cpu);
1011 ns->power += power_of(cpu);
1012 }
1013
1014 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1015 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1016 ns->has_capacity = (ns->nr_running < ns->capacity);
1017}
1018
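update_numa_stats() above sums per-CPU load and compute power for a node, normalises the load by the node's power and derives an approximate task capacity. The worked example below redoes that arithmetic for an invented four-CPU node; the 1024 power unit is assumed to correspond to SCHED_POWER_SCALE.

    /* numa_stats_model.c - worked example of the per-node aggregation. */
    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024UL

    int main(void)
    {
            unsigned long cpu_load[] = { 700, 300, 0, 900 };  /* invented */
            unsigned long nr_running = 3;                     /* invented */
            unsigned long load = 0, power = 0, capacity;

            for (unsigned int i = 0; i < sizeof(cpu_load) / sizeof(cpu_load[0]); i++) {
                    load += cpu_load[i];
                    power += SCHED_POWER_SCALE;     /* one full-power CPU each */
            }

            load = load * SCHED_POWER_SCALE / power;                         /* normalise */
            capacity = (power + SCHED_POWER_SCALE / 2) / SCHED_POWER_SCALE;  /* round closest */

            printf("load=%lu capacity=%lu has_capacity=%d\n",
                   load, capacity, nr_running < capacity);
            return 0;
    }

Here the node's normalised load comes out at 475 and its capacity at 4 tasks, so with 3 tasks running it still reports has_capacity.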
1019struct task_numa_env {
1020 struct task_struct *p;
1021
1022 int src_cpu, src_nid;
1023 int dst_cpu, dst_nid;
1024
1025 struct numa_stats src_stats, dst_stats;
1026
1027 int imbalance_pct, idx;
1028
1029 struct task_struct *best_task;
1030 long best_imp;
1031 int best_cpu;
1032};
1033
1034static void task_numa_assign(struct task_numa_env *env,
1035 struct task_struct *p, long imp)
834{ 1036{
835 int seq; 1037 if (env->best_task)
1038 put_task_struct(env->best_task);
1039 if (p)
1040 get_task_struct(p);
1041
1042 env->best_task = p;
1043 env->best_imp = imp;
1044 env->best_cpu = env->dst_cpu;
1045}
1046
1047/*
1048 * This checks whether the overall compute and NUMA accesses of the system
1049 * would be improved if the source task were migrated to the target dst_cpu,
1050 * taking into account that it might be best to exchange it with the task
1051 * currently running on the dst_cpu.
1052 */
1053static void task_numa_compare(struct task_numa_env *env,
1054 long taskimp, long groupimp)
1055{
1056 struct rq *src_rq = cpu_rq(env->src_cpu);
1057 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1058 struct task_struct *cur;
1059 long dst_load, src_load;
1060 long load;
1061 long imp = (groupimp > 0) ? groupimp : taskimp;
1062
1063 rcu_read_lock();
1064 cur = ACCESS_ONCE(dst_rq->curr);
1065 if (cur->pid == 0) /* idle */
1066 cur = NULL;
1067
1068 /*
1069 * "imp" is the fault differential for the source task between the
1070 * source and destination node. Calculate the total differential for
1071 * the source task and potential destination task. The more negative
1072 * the value is, the more remote accesses would be expected to
1073 * be incurred if the tasks were swapped.
1074 */
1075 if (cur) {
1076 /* Skip this swap candidate if cannot move to the source cpu */
1077 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1078 goto unlock;
1079
1080 /*
1081 * If dst and source tasks are in the same NUMA group, or not
1082 * in any group then look only at task weights.
1083 */
1084 if (cur->numa_group == env->p->numa_group) {
1085 imp = taskimp + task_weight(cur, env->src_nid) -
1086 task_weight(cur, env->dst_nid);
1087 /*
1088 * Add some hysteresis to prevent swapping the
1089 * tasks within a group over tiny differences.
1090 */
1091 if (cur->numa_group)
1092 imp -= imp/16;
1093 } else {
1094 /*
1095 * Compare the group weights. If a task is all by
1096 * itself (not part of a group), use the task weight
1097 * instead.
1098 */
1099 if (env->p->numa_group)
1100 imp = groupimp;
1101 else
1102 imp = taskimp;
1103
1104 if (cur->numa_group)
1105 imp += group_weight(cur, env->src_nid) -
1106 group_weight(cur, env->dst_nid);
1107 else
1108 imp += task_weight(cur, env->src_nid) -
1109 task_weight(cur, env->dst_nid);
1110 }
1111 }
1112
1113 if (imp < env->best_imp)
1114 goto unlock;
1115
1116 if (!cur) {
1117 /* Is there capacity at our destination? */
1118 if (env->src_stats.has_capacity &&
1119 !env->dst_stats.has_capacity)
1120 goto unlock;
1121
1122 goto balance;
1123 }
1124
1125 /* Balance doesn't matter much if we're running a task per cpu */
1126 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1127 goto assign;
836 1128
837 if (!p->mm) /* for example, ksmd faulting in a user's mm */ 1129 /*
1130 * In the overloaded case, try and keep the load balanced.
1131 */
1132balance:
1133 dst_load = env->dst_stats.load;
1134 src_load = env->src_stats.load;
1135
1136 /* XXX missing power terms */
1137 load = task_h_load(env->p);
1138 dst_load += load;
1139 src_load -= load;
1140
1141 if (cur) {
1142 load = task_h_load(cur);
1143 dst_load -= load;
1144 src_load += load;
1145 }
1146
1147 /* make src_load the smaller */
1148 if (dst_load < src_load)
1149 swap(dst_load, src_load);
1150
1151 if (src_load * env->imbalance_pct < dst_load * 100)
1152 goto unlock;
1153
1154assign:
1155 task_numa_assign(env, cur, imp);
1156unlock:
1157 rcu_read_unlock();
1158}
1159
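The balance check at the end of task_numa_compare() keeps a candidate swap only if, after applying the hypothetical moves, the smaller of the two node loads times imbalance_pct is still at least the larger load times 100. The standalone example below works that comparison through for two invented load pairs; 112 appears as the initial imbalance_pct in task_numa_migrate() below, but here it is just a sample value.

    /* imbalance_check_model.c - worked example of the swap/reject decision. */
    #include <stdio.h>

    static int swap_allowed(long src_load, long dst_load, int imbalance_pct)
    {
            if (dst_load < src_load) {              /* make src_load the smaller */
                    long t = src_load; src_load = dst_load; dst_load = t;
            }
            return src_load * imbalance_pct >= dst_load * 100;
    }

    int main(void)
    {
            int pct = 112;          /* sample value, see task_numa_migrate() */

            printf("900 vs 1000: %s\n", swap_allowed(900, 1000, pct) ? "keep" : "reject");
            printf("800 vs 1000: %s\n", swap_allowed(800, 1000, pct) ? "keep" : "reject");
            return 0;
    }

A 900-versus-1000 split survives the 12% allowance (100800 >= 100000) while an 800-versus-1000 split does not (89600 < 100000), which is exactly the "goto unlock" rejection path in the hunk.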
1160static void task_numa_find_cpu(struct task_numa_env *env,
1161 long taskimp, long groupimp)
1162{
1163 int cpu;
1164
1165 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1166 /* Skip this CPU if the source task cannot migrate */
1167 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1168 continue;
1169
1170 env->dst_cpu = cpu;
1171 task_numa_compare(env, taskimp, groupimp);
1172 }
1173}
1174
1175static int task_numa_migrate(struct task_struct *p)
1176{
1177 struct task_numa_env env = {
1178 .p = p,
1179
1180 .src_cpu = task_cpu(p),
1181 .src_nid = task_node(p),
1182
1183 .imbalance_pct = 112,
1184
1185 .best_task = NULL,
1186 .best_imp = 0,
1187 .best_cpu = -1
1188 };
1189 struct sched_domain *sd;
1190 unsigned long taskweight, groupweight;
1191 int nid, ret;
1192 long taskimp, groupimp;
1193
1194 /*
1195 * Pick the lowest SD_NUMA domain, as that would have the smallest
1196 * imbalance and would be the first to start moving tasks about.
1197 *
1198 * And we want to avoid any moving of tasks about, as that would create
1199 * random movement of tasks -- counter the numa conditions we're trying
1200 * to satisfy here.
1201 */
1202 rcu_read_lock();
1203 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1204 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1205 rcu_read_unlock();
1206
1207 taskweight = task_weight(p, env.src_nid);
1208 groupweight = group_weight(p, env.src_nid);
1209 update_numa_stats(&env.src_stats, env.src_nid);
1210 env.dst_nid = p->numa_preferred_nid;
1211 taskimp = task_weight(p, env.dst_nid) - taskweight;
1212 groupimp = group_weight(p, env.dst_nid) - groupweight;
1213 update_numa_stats(&env.dst_stats, env.dst_nid);
1214
1215 /* If the preferred nid has capacity, try to use it. */
1216 if (env.dst_stats.has_capacity)
1217 task_numa_find_cpu(&env, taskimp, groupimp);
1218
1219 /* No space available on the preferred nid. Look elsewhere. */
1220 if (env.best_cpu == -1) {
1221 for_each_online_node(nid) {
1222 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1223 continue;
1224
1225 /* Only consider nodes where both task and groups benefit */
1226 taskimp = task_weight(p, nid) - taskweight;
1227 groupimp = group_weight(p, nid) - groupweight;
1228 if (taskimp < 0 && groupimp < 0)
1229 continue;
1230
1231 env.dst_nid = nid;
1232 update_numa_stats(&env.dst_stats, env.dst_nid);
1233 task_numa_find_cpu(&env, taskimp, groupimp);
1234 }
1235 }
1236
1237 /* No better CPU than the current one was found. */
1238 if (env.best_cpu == -1)
1239 return -EAGAIN;
1240
1241 sched_setnuma(p, env.dst_nid);
1242
1243 /*
1244 * Reset the scan period if the task is being rescheduled on an
1245 * alternative node to recheck if the task is now properly placed.
1246 */
1247 p->numa_scan_period = task_scan_min(p);
1248
1249 if (env.best_task == NULL) {
1250 int ret = migrate_task_to(p, env.best_cpu);
1251 return ret;
1252 }
1253
1254 ret = migrate_swap(p, env.best_task);
1255 put_task_struct(env.best_task);
1256 return ret;
1257}
1258
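The imbalance_pct used above simply halves the NUMA domain's balance margin. A tiny sketch with a hypothetical domain value of 125 (a 25% margin), which is what makes the 112 default in the env initializer plausible:

#include <stdio.h>

/* Sketch: task_numa_migrate() halves the domain's imbalance margin. */
static int numa_imbalance_pct(int sd_imbalance_pct)
{
        return 100 + (sd_imbalance_pct - 100) / 2;
}

int main(void)
{
        printf("%d\n", numa_imbalance_pct(125)); /* prints 112: 25% margin -> 12% */
        printf("%d\n", numa_imbalance_pct(117)); /* prints 108 */
        return 0;
}
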
1259/* Attempt to migrate a task to a CPU on the preferred node. */
1260static void numa_migrate_preferred(struct task_struct *p)
1261{
1262 /* This task has no NUMA fault statistics yet */
1263 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1264 return;
1265
1266 /* Periodically retry migrating the task to the preferred node */
1267 p->numa_migrate_retry = jiffies + HZ;
1268
1269 /* Success if task is already running on preferred CPU */
1270 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
838 return; 1271 return;
1272
1273 /* Otherwise, try migrate to a CPU on the preferred node */
1274 task_numa_migrate(p);
1275}
1276
1277/*
1278 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1279 * increments. The more local the fault statistics are, the higher the scan
1280 * period will be for the next scan window. If the local/remote ratio is below
1281 * NUMA_PERIOD_THRESHOLD (where the ratio ranges over 1..NUMA_PERIOD_SLOTS) the
1282 * scan period will decrease.
1283 */
1284#define NUMA_PERIOD_SLOTS 10
1285#define NUMA_PERIOD_THRESHOLD 3
1286
1287/*
1288 * Increase the scan period (slow down scanning) if the majority of
1289 * our memory is already on our local node, or if the majority of
1290 * the page accesses are shared with other processes.
1291 * Otherwise, decrease the scan period.
1292 */
1293static void update_task_scan_period(struct task_struct *p,
1294 unsigned long shared, unsigned long private)
1295{
1296 unsigned int period_slot;
1297 int ratio;
1298 int diff;
1299
1300 unsigned long remote = p->numa_faults_locality[0];
1301 unsigned long local = p->numa_faults_locality[1];
1302
1303 /*
1304 * If there were no recorded hinting faults then either the task is
1305 * completely idle or all activity is in areas that are not of interest
1306 * to automatic numa balancing. Scan slower.
1307 */
1308 if (local + shared == 0) {
1309 p->numa_scan_period = min(p->numa_scan_period_max,
1310 p->numa_scan_period << 1);
1311
1312 p->mm->numa_next_scan = jiffies +
1313 msecs_to_jiffies(p->numa_scan_period);
1314
1315 return;
1316 }
1317
1318 /*
1319 * Prepare to scale scan period relative to the current period.
1320 * == NUMA_PERIOD_THRESHOLD scan period increases by a single slot (scan slower)
1321 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1322 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1323 */
1324 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1325 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1326 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1327 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1328 if (!slot)
1329 slot = 1;
1330 diff = slot * period_slot;
1331 } else {
1332 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1333
1334 /*
1335 * Scale scan rate increases based on sharing. There is an
1336 * inverse relationship between the degree of sharing and
1337 * the adjustment made to the scanning period. Broadly
1338 * speaking, the intent is that there is little point
1339 * scanning faster if shared accesses dominate, as it may
1340 * simply bounce migrations uselessly.
1341 */
1342 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1343 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1344 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1345 }
1346
1347 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1348 task_scan_min(p), task_scan_max(p));
1349 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1350}
1351
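A userspace sketch of the adjustment above, slightly simplified (the recomputed-but-unused period_slot in the shared branch is dropped); all fault counts and periods are illustrative:

#include <stdio.h>

#define NUMA_PERIOD_SLOTS       10
#define NUMA_PERIOD_THRESHOLD   3
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

static long clamp(long v, long lo, long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/*
 * Mostly-local faults slow the scanner down, mostly-remote faults speed
 * it up, and heavy sharing damps the speed-up.
 */
static long next_scan_period(long period, long min_p, long max_p,
                             long local, long remote,
                             long private, long shared)
{
        long period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
        long ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
        long diff;

        if (ratio >= NUMA_PERIOD_THRESHOLD) {
                long slot = ratio - NUMA_PERIOD_THRESHOLD;
                if (!slot)
                        slot = 1;
                diff = slot * period_slot;              /* scan slower */
        } else {
                diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
                /* damp the speed-up when accesses are mostly shared */
                ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS,
                                     private + shared);
                diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
        }
        return clamp(period + diff, min_p, max_p);
}

int main(void)
{
        /* 90% local: a 1000ms period grows by 6 slots to 1600ms */
        printf("%ld\n", next_scan_period(1000, 100, 60000, 90, 10, 90, 10));
        /* 10% local, all private: the period shrinks by 2 slots to 800ms */
        printf("%ld\n", next_scan_period(1000, 100, 60000, 10, 90, 100, 0));
        return 0;
}
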
1352static void task_numa_placement(struct task_struct *p)
1353{
1354 int seq, nid, max_nid = -1, max_group_nid = -1;
1355 unsigned long max_faults = 0, max_group_faults = 0;
1356 unsigned long fault_types[2] = { 0, 0 };
1357 spinlock_t *group_lock = NULL;
1358
839 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1359 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 if (p->numa_scan_seq == seq) 1360 if (p->numa_scan_seq == seq)
841 return; 1361 return;
842 p->numa_scan_seq = seq; 1362 p->numa_scan_seq = seq;
1363 p->numa_scan_period_max = task_scan_max(p);
1364
1365 /* If the task is part of a group prevent parallel updates to group stats */
1366 if (p->numa_group) {
1367 group_lock = &p->numa_group->lock;
1368 spin_lock(group_lock);
1369 }
1370
1371 /* Find the node with the highest number of faults */
1372 for_each_online_node(nid) {
1373 unsigned long faults = 0, group_faults = 0;
1374 int priv, i;
1375
1376 for (priv = 0; priv < 2; priv++) {
1377 long diff;
1378
1379 i = task_faults_idx(nid, priv);
1380 diff = -p->numa_faults[i];
1381
1382 /* Decay existing window, copy faults since last scan */
1383 p->numa_faults[i] >>= 1;
1384 p->numa_faults[i] += p->numa_faults_buffer[i];
1385 fault_types[priv] += p->numa_faults_buffer[i];
1386 p->numa_faults_buffer[i] = 0;
1387
1388 faults += p->numa_faults[i];
1389 diff += p->numa_faults[i];
1390 p->total_numa_faults += diff;
1391 if (p->numa_group) {
1392 /* safe because we can only change our own group */
1393 p->numa_group->faults[i] += diff;
1394 p->numa_group->total_faults += diff;
1395 group_faults += p->numa_group->faults[i];
1396 }
1397 }
1398
1399 if (faults > max_faults) {
1400 max_faults = faults;
1401 max_nid = nid;
1402 }
1403
1404 if (group_faults > max_group_faults) {
1405 max_group_faults = group_faults;
1406 max_group_nid = nid;
1407 }
1408 }
1409
1410 update_task_scan_period(p, fault_types[0], fault_types[1]);
1411
1412 if (p->numa_group) {
1413 /*
1414 * If the preferred task and group nids are different,
1415 * iterate over the nodes again to find the best place.
1416 */
1417 if (max_nid != max_group_nid) {
1418 unsigned long weight, max_weight = 0;
1419
1420 for_each_online_node(nid) {
1421 weight = task_weight(p, nid) + group_weight(p, nid);
1422 if (weight > max_weight) {
1423 max_weight = weight;
1424 max_nid = nid;
1425 }
1426 }
1427 }
1428
1429 spin_unlock(group_lock);
1430 }
1431
1432 /* Set the preferred node to the node with the most faults */
1433 if (max_faults && max_nid != p->numa_preferred_nid) {
1434 /* Update the preferred nid and migrate task if possible */
1435 sched_setnuma(p, max_nid);
1436 numa_migrate_preferred(p);
1437 }
1438}
1439
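The per-node fault statistic above is a simple exponential moving average: each scan window keeps half of the old count and adds the faults recorded since the last scan. A tiny sketch with hypothetical counts for one node:

#include <stdio.h>

int main(void)
{
        unsigned long faults = 0;
        /* hypothetical faults recorded over five scan windows */
        unsigned long buffer[] = { 64, 64, 64, 0, 0 };
        int i;

        for (i = 0; i < 5; i++) {
                faults >>= 1;           /* decay the existing window */
                faults += buffer[i];    /* add faults since the last scan */
                printf("window %d: %lu\n", i, faults);
        }
        /* prints 64, 96, 112, 56, 28: old activity fades within a few windows */
        return 0;
}
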
1440static inline int get_numa_group(struct numa_group *grp)
1441{
1442 return atomic_inc_not_zero(&grp->refcount);
1443}
1444
1445static inline void put_numa_group(struct numa_group *grp)
1446{
1447 if (atomic_dec_and_test(&grp->refcount))
1448 kfree_rcu(grp, rcu);
1449}
1450
1451static void double_lock(spinlock_t *l1, spinlock_t *l2)
1452{
1453 if (l1 > l2)
1454 swap(l1, l2);
1455
1456 spin_lock(l1);
1457 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1458}
1459
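double_lock() avoids an AB-BA deadlock by always taking the lock with the lower address first. A userspace sketch of the same idea, with pthread mutexes standing in for the group spinlocks (illustrative only):

#include <pthread.h>
#include <stdio.h>

static void double_lock(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
        if (l1 > l2) {                  /* order by address */
                pthread_mutex_t *tmp = l1;
                l1 = l2;
                l2 = tmp;
        }
        pthread_mutex_lock(l1);
        pthread_mutex_lock(l2);
}

int main(void)
{
        pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

        /* both call orders acquire the same mutex first */
        double_lock(&a, &b);
        pthread_mutex_unlock(&b);
        pthread_mutex_unlock(&a);

        double_lock(&b, &a);
        pthread_mutex_unlock(&b);
        pthread_mutex_unlock(&a);

        puts("consistent lock order, no deadlock possible");
        return 0;
}
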
1460static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1461 int *priv)
1462{
1463 struct numa_group *grp, *my_grp;
1464 struct task_struct *tsk;
1465 bool join = false;
1466 int cpu = cpupid_to_cpu(cpupid);
1467 int i;
1468
1469 if (unlikely(!p->numa_group)) {
1470 unsigned int size = sizeof(struct numa_group) +
1471 2*nr_node_ids*sizeof(unsigned long);
1472
1473 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1474 if (!grp)
1475 return;
1476
1477 atomic_set(&grp->refcount, 1);
1478 spin_lock_init(&grp->lock);
1479 INIT_LIST_HEAD(&grp->task_list);
1480 grp->gid = p->pid;
1481
1482 for (i = 0; i < 2*nr_node_ids; i++)
1483 grp->faults[i] = p->numa_faults[i];
1484
1485 grp->total_faults = p->total_numa_faults;
1486
1487 list_add(&p->numa_entry, &grp->task_list);
1488 grp->nr_tasks++;
1489 rcu_assign_pointer(p->numa_group, grp);
1490 }
1491
1492 rcu_read_lock();
1493 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1494
1495 if (!cpupid_match_pid(tsk, cpupid))
1496 goto no_join;
1497
1498 grp = rcu_dereference(tsk->numa_group);
1499 if (!grp)
1500 goto no_join;
1501
1502 my_grp = p->numa_group;
1503 if (grp == my_grp)
1504 goto no_join;
1505
1506 /*
1507 * Only join the other group if it's bigger; if we're the bigger group,
1508 * the other task will join us.
1509 */
1510 if (my_grp->nr_tasks > grp->nr_tasks)
1511 goto no_join;
1512
1513 /*
1514 * Tie-break on the grp address.
1515 */
1516 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1517 goto no_join;
1518
1519 /* Always join threads in the same process. */
1520 if (tsk->mm == current->mm)
1521 join = true;
1522
1523 /* Simple filter to avoid false positives due to PID collisions */
1524 if (flags & TNF_SHARED)
1525 join = true;
1526
1527 /* Update priv based on whether false sharing was detected */
1528 *priv = !join;
1529
1530 if (join && !get_numa_group(grp))
1531 goto no_join;
1532
1533 rcu_read_unlock();
843 1534
844 /* FIXME: Scheduling placement policy hints go here */ 1535 if (!join)
1536 return;
1537
1538 double_lock(&my_grp->lock, &grp->lock);
1539
1540 for (i = 0; i < 2*nr_node_ids; i++) {
1541 my_grp->faults[i] -= p->numa_faults[i];
1542 grp->faults[i] += p->numa_faults[i];
1543 }
1544 my_grp->total_faults -= p->total_numa_faults;
1545 grp->total_faults += p->total_numa_faults;
1546
1547 list_move(&p->numa_entry, &grp->task_list);
1548 my_grp->nr_tasks--;
1549 grp->nr_tasks++;
1550
1551 spin_unlock(&my_grp->lock);
1552 spin_unlock(&grp->lock);
1553
1554 rcu_assign_pointer(p->numa_group, grp);
1555
1556 put_numa_group(my_grp);
1557 return;
1558
1559no_join:
1560 rcu_read_unlock();
1561 return;
1562}
1563
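A sketch of the join policy above, with the numa_group reduced to the single field the size checks look at; struct fake_group and should_join are hypothetical names, and the refcount/locking steps are left out:

#include <stdbool.h>
#include <stdio.h>

struct fake_group {
        int nr_tasks;
};

static bool should_join(struct fake_group *mine, struct fake_group *theirs,
                        bool same_mm, bool shared_fault)
{
        if (mine == theirs)
                return false;
        if (mine->nr_tasks > theirs->nr_tasks)
                return false;   /* we're bigger: the other task joins us */
        if (mine->nr_tasks == theirs->nr_tasks && mine > theirs)
                return false;   /* equal sizes: tie-break on address */
        /* join threads of the same process, or on a shared fault */
        return same_mm || shared_fault;
}

int main(void)
{
        struct fake_group g[2] = { { .nr_tasks = 2 }, { .nr_tasks = 5 } };

        printf("%d\n", should_join(&g[0], &g[1], true, false)); /* 1: smaller joins bigger */
        printf("%d\n", should_join(&g[1], &g[0], true, false)); /* 0: bigger group waits */
        return 0;
}
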
1564void task_numa_free(struct task_struct *p)
1565{
1566 struct numa_group *grp = p->numa_group;
1567 int i;
1568 void *numa_faults = p->numa_faults;
1569
1570 if (grp) {
1571 spin_lock(&grp->lock);
1572 for (i = 0; i < 2*nr_node_ids; i++)
1573 grp->faults[i] -= p->numa_faults[i];
1574 grp->total_faults -= p->total_numa_faults;
1575
1576 list_del(&p->numa_entry);
1577 grp->nr_tasks--;
1578 spin_unlock(&grp->lock);
1579 rcu_assign_pointer(p->numa_group, NULL);
1580 put_numa_group(grp);
1581 }
1582
1583 p->numa_faults = NULL;
1584 p->numa_faults_buffer = NULL;
1585 kfree(numa_faults);
845} 1586}
846 1587
847/* 1588/*
848 * Got a PROT_NONE fault for a page on @node. 1589 * Got a PROT_NONE fault for a page on @node.
849 */ 1590 */
850void task_numa_fault(int node, int pages, bool migrated) 1591void task_numa_fault(int last_cpupid, int node, int pages, int flags)
851{ 1592{
852 struct task_struct *p = current; 1593 struct task_struct *p = current;
1594 bool migrated = flags & TNF_MIGRATED;
1595 int priv;
853 1596
854 if (!numabalancing_enabled) 1597 if (!numabalancing_enabled)
855 return; 1598 return;
856 1599
857 /* FIXME: Allocate task-specific structure for placement policy here */ 1600 /* for example, ksmd faulting in a user's mm */
1601 if (!p->mm)
1602 return;
1603
1604 /* Do not worry about placement if exiting */
1605 if (p->state == TASK_DEAD)
1606 return;
1607
1608 /* Allocate buffer to track faults on a per-node basis */
1609 if (unlikely(!p->numa_faults)) {
1610 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1611
1612 /* numa_faults and numa_faults_buffer share the allocation */
1613 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1614 if (!p->numa_faults)
1615 return;
1616
1617 BUG_ON(p->numa_faults_buffer);
1618 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1619 p->total_numa_faults = 0;
1620 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1621 }
858 1622
859 /* 1623 /*
860 * If pages are properly placed (did not migrate) then scan slower. 1624 * First accesses are treated as private, otherwise consider accesses
861 * This is reset periodically in case of phase changes 1625 * to be private if the accessing pid has not changed
862 */ 1626 */
863 if (!migrated) 1627 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
864 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, 1628 priv = 1;
865 p->numa_scan_period + jiffies_to_msecs(10)); 1629 } else {
1630 priv = cpupid_match_pid(p, last_cpupid);
1631 if (!priv && !(flags & TNF_NO_GROUP))
1632 task_numa_group(p, last_cpupid, flags, &priv);
1633 }
866 1634
867 task_numa_placement(p); 1635 task_numa_placement(p);
1636
1637 /*
1638 * Retry the task-to-preferred-node migration periodically, in case
1639 * it previously failed, or the scheduler moved us.
1640 */
1641 if (time_after(jiffies, p->numa_migrate_retry))
1642 numa_migrate_preferred(p);
1643
1644 if (migrated)
1645 p->numa_pages_migrated += pages;
1646
1647 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1648 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
868} 1649}
869 1650
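A sketch of the first-access / same-pid classification above, using a hypothetical 8-bit pid field rather than the kernel's packed cpupid bits:

#include <stdio.h>

#define PID_BITS        8
#define PID_MASK        ((1 << PID_BITS) - 1)

static int fault_is_private(int last_pid_bits, int current_pid)
{
        if (last_pid_bits == (-1 & PID_MASK))
                return 1;       /* first fault on the page: treat as private */
        return last_pid_bits == (current_pid & PID_MASK);
}

int main(void)
{
        printf("%d\n", fault_is_private(-1 & PID_MASK, 1234));   /* 1: first access */
        printf("%d\n", fault_is_private(1234 & PID_MASK, 1234)); /* 1: same task faulted last */
        printf("%d\n", fault_is_private(1234 & PID_MASK, 4321)); /* 0: another task faulted last */
        return 0;
}
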
870static void reset_ptenuma_scan(struct task_struct *p) 1651static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1665,7 @@ void task_numa_work(struct callback_head *work)
884 struct mm_struct *mm = p->mm; 1665 struct mm_struct *mm = p->mm;
885 struct vm_area_struct *vma; 1666 struct vm_area_struct *vma;
886 unsigned long start, end; 1667 unsigned long start, end;
1668 unsigned long nr_pte_updates = 0;
887 long pages; 1669 long pages;
888 1670
889 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 1671 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1682,9 @@ void task_numa_work(struct callback_head *work)
900 if (p->flags & PF_EXITING) 1682 if (p->flags & PF_EXITING)
901 return; 1683 return;
902 1684
903 /* 1685 if (!mm->numa_next_scan) {
904 * We do not care about task placement until a task runs on a node 1686 mm->numa_next_scan = now +
905 * other than the first one used by the address space. This is 1687 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
906 * largely because migrations are driven by what CPU the task
907 * is running on. If it's never scheduled on another node, it'll
908 * not migrate so why bother trapping the fault.
909 */
910 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 mm->first_nid = numa_node_id();
912 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 /* Are we running on a new node yet? */
914 if (numa_node_id() == mm->first_nid &&
915 !sched_feat_numa(NUMA_FORCE))
916 return;
917
918 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 }
920
921 /*
922 * Reset the scan period if enough time has gone by. Objective is that
923 * scanning will be reduced if pages are properly placed. As tasks
924 * can enter different phases this needs to be re-examined. Lacking
925 * proper tracking of reference behaviour, this blunt hammer is used.
926 */
927 migrate = mm->numa_next_reset;
928 if (time_after(now, migrate)) {
929 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 xchg(&mm->numa_next_reset, next_scan);
932 } 1688 }
933 1689
934 /* 1690 /*
@@ -938,20 +1694,20 @@ void task_numa_work(struct callback_head *work)
938 if (time_before(now, migrate)) 1694 if (time_before(now, migrate))
939 return; 1695 return;
940 1696
941 if (p->numa_scan_period == 0) 1697 if (p->numa_scan_period == 0) {
942 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1698 p->numa_scan_period_max = task_scan_max(p);
1699 p->numa_scan_period = task_scan_min(p);
1700 }
943 1701
944 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 1702 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 1703 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 return; 1704 return;
947 1705
948 /* 1706 /*
949 * Do not set pte_numa if the current running node is rate-limited. 1707 * Delay this task enough that another task of this mm will likely win
950 * This loses statistics on the fault but if we are unwilling to 1708 * the next time around.
951 * migrate to this node, it is less likely we can do useful work
952 */ 1709 */
953 if (migrate_ratelimited(numa_node_id())) 1710 p->node_stamp += 2 * TICK_NSEC;
954 return;
955 1711
956 start = mm->numa_scan_offset; 1712 start = mm->numa_scan_offset;
957 pages = sysctl_numa_balancing_scan_size; 1713 pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1723,32 @@ void task_numa_work(struct callback_head *work)
967 vma = mm->mmap; 1723 vma = mm->mmap;
968 } 1724 }
969 for (; vma; vma = vma->vm_next) { 1725 for (; vma; vma = vma->vm_next) {
970 if (!vma_migratable(vma)) 1726 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
971 continue; 1727 continue;
972 1728
973 /* Skip small VMAs. They are not likely to be of relevance */ 1729 /*
974 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 1730 * Shared library pages mapped by multiple processes are not
1731 * migrated as it is expected they are cache replicated. Avoid
1732 * hinting faults in read-only file-backed mappings or the vdso
1733 * as migrating the pages will be of marginal benefit.
1734 */
1735 if (!vma->vm_mm ||
1736 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
975 continue; 1737 continue;
976 1738
977 do { 1739 do {
978 start = max(start, vma->vm_start); 1740 start = max(start, vma->vm_start);
979 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1741 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 end = min(end, vma->vm_end); 1742 end = min(end, vma->vm_end);
981 pages -= change_prot_numa(vma, start, end); 1743 nr_pte_updates += change_prot_numa(vma, start, end);
1744
1745 /*
1746 * Scan sysctl_numa_balancing_scan_size but ensure that
1747 * at least one PTE is updated so that unused virtual
1748 * address space is quickly skipped.
1749 */
1750 if (nr_pte_updates)
1751 pages -= (end - start) >> PAGE_SHIFT;
982 1752
983 start = end; 1753 start = end;
984 if (pages <= 0) 1754 if (pages <= 0)
@@ -988,10 +1758,10 @@ void task_numa_work(struct callback_head *work)
988 1758
989out: 1759out:
990 /* 1760 /*
991 * It is possible to reach the end of the VMA list but the last few VMAs are 1761 * It is possible to reach the end of the VMA list but the last few
992 * not guaranteed to be vma_migratable. If they are not, we would find the 1762 * VMAs are not guaranteed to be vma_migratable. If they are not, we
993 * !migratable VMA on the next scan but not reset the scanner to the start 1763 * would find the !migratable VMA on the next scan but not reset the
994 * so check it now. 1764 * scanner to the start so check it now.
995 */ 1765 */
996 if (vma) 1766 if (vma)
997 mm->numa_scan_offset = start; 1767 mm->numa_scan_offset = start;
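A sketch of how the scan window above walks a VMA: chunks that produced no PTE updates are not charged against the page budget, so sparse address space is skipped almost for free. The VMA layout, the 2MB HPAGE_SIZE and the "first six chunks are empty" pattern are all assumptions:

#include <stdio.h>

#define PAGE_SHIFT      12
#define HPAGE_SIZE      (1UL << 21)     /* 2MB, x86-64 assumption */
#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long vm_start = 0x400000;
        unsigned long vm_end = vm_start + 8 * HPAGE_SIZE;
        unsigned long start = vm_start, end;
        long pages = 256;               /* budget: 256 base pages = 1MB */
        int chunk = 0;

        while (start < vm_end && pages > 0) {
                end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
                if (end > vm_end)
                        end = vm_end;
                if (chunk >= 6)         /* only later chunks have PTEs to update */
                        pages -= (end - start) >> PAGE_SHIFT;
                start = end;
                chunk++;
        }
        /* prints: stopped at 0x1200000 with -256 pages of budget left */
        printf("stopped at %#lx with %ld pages of budget left\n", start, pages);
        return 0;
}
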
@@ -1025,8 +1795,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1025 1795
1026 if (now - curr->node_stamp > period) { 1796 if (now - curr->node_stamp > period) {
1027 if (!curr->node_stamp) 1797 if (!curr->node_stamp)
1028 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1798 curr->numa_scan_period = task_scan_min(curr);
1029 curr->node_stamp = now; 1799 curr->node_stamp += period;
1030 1800
1031 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 1801 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 1802 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1808,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1038static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1808static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039{ 1809{
1040} 1810}
1811
1812static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1813{
1814}
1815
1816static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1817{
1818}
1041#endif /* CONFIG_NUMA_BALANCING */ 1819#endif /* CONFIG_NUMA_BALANCING */
1042 1820
1043static void 1821static void
@@ -1047,8 +1825,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1047 if (!parent_entity(se)) 1825 if (!parent_entity(se))
1048 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1826 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049#ifdef CONFIG_SMP 1827#ifdef CONFIG_SMP
1050 if (entity_is_task(se)) 1828 if (entity_is_task(se)) {
1051 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1829 struct rq *rq = rq_of(cfs_rq);
1830
1831 account_numa_enqueue(rq, task_of(se));
1832 list_add(&se->group_node, &rq->cfs_tasks);
1833 }
1052#endif 1834#endif
1053 cfs_rq->nr_running++; 1835 cfs_rq->nr_running++;
1054} 1836}
@@ -1059,8 +1841,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1059 update_load_sub(&cfs_rq->load, se->load.weight); 1841 update_load_sub(&cfs_rq->load, se->load.weight);
1060 if (!parent_entity(se)) 1842 if (!parent_entity(se))
1061 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1843 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 if (entity_is_task(se)) 1844 if (entity_is_task(se)) {
1845 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1063 list_del_init(&se->group_node); 1846 list_del_init(&se->group_node);
1847 }
1064 cfs_rq->nr_running--; 1848 cfs_rq->nr_running--;
1065} 1849}
1066 1850
@@ -3113,7 +3897,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3113{ 3897{
3114 struct sched_entity *se = tg->se[cpu]; 3898 struct sched_entity *se = tg->se[cpu];
3115 3899
3116 if (!tg->parent) /* the trivial, non-cgroup case */ 3900 if (!tg->parent || !wl) /* the trivial, non-cgroup case */
3117 return wl; 3901 return wl;
3118 3902
3119 for_each_sched_entity(se) { 3903 for_each_sched_entity(se) {
@@ -3166,8 +3950,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3166} 3950}
3167#else 3951#else
3168 3952
3169static inline unsigned long effective_load(struct task_group *tg, int cpu, 3953static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3170 unsigned long wl, unsigned long wg)
3171{ 3954{
3172 return wl; 3955 return wl;
3173} 3956}
@@ -3420,11 +4203,10 @@ done:
3420 * preempt must be disabled. 4203 * preempt must be disabled.
3421 */ 4204 */
3422static int 4205static int
3423select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 4206select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
3424{ 4207{
3425 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4208 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3426 int cpu = smp_processor_id(); 4209 int cpu = smp_processor_id();
3427 int prev_cpu = task_cpu(p);
3428 int new_cpu = cpu; 4210 int new_cpu = cpu;
3429 int want_affine = 0; 4211 int want_affine = 0;
3430 int sync = wake_flags & WF_SYNC; 4212 int sync = wake_flags & WF_SYNC;
@@ -3904,9 +4686,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3904 4686
3905static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4687static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3906 4688
4689enum fbq_type { regular, remote, all };
4690
3907#define LBF_ALL_PINNED 0x01 4691#define LBF_ALL_PINNED 0x01
3908#define LBF_NEED_BREAK 0x02 4692#define LBF_NEED_BREAK 0x02
3909#define LBF_SOME_PINNED 0x04 4693#define LBF_DST_PINNED 0x04
4694#define LBF_SOME_PINNED 0x08
3910 4695
3911struct lb_env { 4696struct lb_env {
3912 struct sched_domain *sd; 4697 struct sched_domain *sd;
@@ -3929,6 +4714,8 @@ struct lb_env {
3929 unsigned int loop; 4714 unsigned int loop;
3930 unsigned int loop_break; 4715 unsigned int loop_break;
3931 unsigned int loop_max; 4716 unsigned int loop_max;
4717
4718 enum fbq_type fbq_type;
3932}; 4719};
3933 4720
3934/* 4721/*
@@ -3975,6 +4762,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3975 return delta < (s64)sysctl_sched_migration_cost; 4762 return delta < (s64)sysctl_sched_migration_cost;
3976} 4763}
3977 4764
4765#ifdef CONFIG_NUMA_BALANCING
4766/* Returns true if the destination node has incurred more faults */
4767static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4768{
4769 int src_nid, dst_nid;
4770
4771 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4772 !(env->sd->flags & SD_NUMA)) {
4773 return false;
4774 }
4775
4776 src_nid = cpu_to_node(env->src_cpu);
4777 dst_nid = cpu_to_node(env->dst_cpu);
4778
4779 if (src_nid == dst_nid)
4780 return false;
4781
4782 /* Always encourage migration to the preferred node. */
4783 if (dst_nid == p->numa_preferred_nid)
4784 return true;
4785
4786 /* If both task and group weight improve, this move is a winner. */
4787 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4788 group_weight(p, dst_nid) > group_weight(p, src_nid))
4789 return true;
4790
4791 return false;
4792}
4793
4794
4795static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4796{
4797 int src_nid, dst_nid;
4798
4799 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4800 return false;
4801
4802 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4803 return false;
4804
4805 src_nid = cpu_to_node(env->src_cpu);
4806 dst_nid = cpu_to_node(env->dst_cpu);
4807
4808 if (src_nid == dst_nid)
4809 return false;
4810
4811 /* Migrating away from the preferred node is always bad. */
4812 if (src_nid == p->numa_preferred_nid)
4813 return true;
4814
4815 /* If either task or group weight get worse, don't do it. */
4816 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4817 group_weight(p, dst_nid) < group_weight(p, src_nid))
4818 return true;
4819
4820 return false;
4821}
4822
4823#else
4824static inline bool migrate_improves_locality(struct task_struct *p,
4825 struct lb_env *env)
4826{
4827 return false;
4828}
4829
4830static inline bool migrate_degrades_locality(struct task_struct *p,
4831 struct lb_env *env)
4832{
4833 return false;
4834}
4835#endif
4836
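A sketch of the improves-locality test with the fault weights reduced to plain per-node counts on a hypothetical two-node machine; the sched_feat() and SD_NUMA guards are omitted:

#include <stdbool.h>
#include <stdio.h>

struct fake_task {
        int preferred_nid;
        long task_faults[2];            /* per-node counts, two nodes assumed */
        long group_faults[2];
};

static bool improves_locality(struct fake_task *p, int src_nid, int dst_nid)
{
        if (src_nid == dst_nid)
                return false;
        if (dst_nid == p->preferred_nid)
                return true;            /* always encourage the preferred node */
        return p->task_faults[dst_nid] > p->task_faults[src_nid] &&
               p->group_faults[dst_nid] > p->group_faults[src_nid];
}

int main(void)
{
        struct fake_task p = {
                .preferred_nid = 1,
                .task_faults   = { 10, 90 },
                .group_faults  = { 40, 60 },
        };

        printf("%d\n", improves_locality(&p, 0, 1)); /* 1: towards the preferred node */
        printf("%d\n", improves_locality(&p, 1, 0)); /* 0: away from it */
        return 0;
}
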
3978/* 4837/*
3979 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 4838 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3980 */ 4839 */
@@ -3997,6 +4856,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3997 4856
3998 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 4857 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3999 4858
4859 env->flags |= LBF_SOME_PINNED;
4860
4000 /* 4861 /*
4001 * Remember if this task can be migrated to any other cpu in 4862 * Remember if this task can be migrated to any other cpu in
4002 * our sched_group. We may want to revisit it if we couldn't 4863 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4866,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4005 * Also avoid computing new_dst_cpu if we have already computed 4866 * Also avoid computing new_dst_cpu if we have already computed
4006 * one in current iteration. 4867 * one in current iteration.
4007 */ 4868 */
4008 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 4869 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
4009 return 0; 4870 return 0;
4010 4871
4011 /* Prevent to re-select dst_cpu via env's cpus */ 4872 /* Prevent to re-select dst_cpu via env's cpus */
4012 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 4873 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4013 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 4874 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4014 env->flags |= LBF_SOME_PINNED; 4875 env->flags |= LBF_DST_PINNED;
4015 env->new_dst_cpu = cpu; 4876 env->new_dst_cpu = cpu;
4016 break; 4877 break;
4017 } 4878 }
@@ -4030,11 +4891,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4030 4891
4031 /* 4892 /*
4032 * Aggressive migration if: 4893 * Aggressive migration if:
4033 * 1) task is cache cold, or 4894 * 1) destination numa is preferred
4034 * 2) too many balance attempts have failed. 4895 * 2) task is cache cold, or
4896 * 3) too many balance attempts have failed.
4035 */ 4897 */
4036
4037 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 4898 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4899 if (!tsk_cache_hot)
4900 tsk_cache_hot = migrate_degrades_locality(p, env);
4901
4902 if (migrate_improves_locality(p, env)) {
4903#ifdef CONFIG_SCHEDSTATS
4904 if (tsk_cache_hot) {
4905 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4906 schedstat_inc(p, se.statistics.nr_forced_migrations);
4907 }
4908#endif
4909 return 1;
4910 }
4911
4038 if (!tsk_cache_hot || 4912 if (!tsk_cache_hot ||
4039 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 4913 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4040 4914
@@ -4077,8 +4951,6 @@ static int move_one_task(struct lb_env *env)
4077 return 0; 4951 return 0;
4078} 4952}
4079 4953
4080static unsigned long task_h_load(struct task_struct *p);
4081
4082static const unsigned int sched_nr_migrate_break = 32; 4954static const unsigned int sched_nr_migrate_break = 32;
4083 4955
4084/* 4956/*
@@ -4291,6 +5163,10 @@ struct sg_lb_stats {
4291 unsigned int group_weight; 5163 unsigned int group_weight;
4292 int group_imb; /* Is there an imbalance in the group ? */ 5164 int group_imb; /* Is there an imbalance in the group ? */
4293 int group_has_capacity; /* Is there extra capacity in the group? */ 5165 int group_has_capacity; /* Is there extra capacity in the group? */
5166#ifdef CONFIG_NUMA_BALANCING
5167 unsigned int nr_numa_running;
5168 unsigned int nr_preferred_running;
5169#endif
4294}; 5170};
4295 5171
4296/* 5172/*
@@ -4447,7 +5323,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4447{ 5323{
4448 struct sched_domain *child = sd->child; 5324 struct sched_domain *child = sd->child;
4449 struct sched_group *group, *sdg = sd->groups; 5325 struct sched_group *group, *sdg = sd->groups;
4450 unsigned long power; 5326 unsigned long power, power_orig;
4451 unsigned long interval; 5327 unsigned long interval;
4452 5328
4453 interval = msecs_to_jiffies(sd->balance_interval); 5329 interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +5335,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4459 return; 5335 return;
4460 } 5336 }
4461 5337
4462 power = 0; 5338 power_orig = power = 0;
4463 5339
4464 if (child->flags & SD_OVERLAP) { 5340 if (child->flags & SD_OVERLAP) {
4465 /* 5341 /*
@@ -4467,8 +5343,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
4467 * span the current group. 5343 * span the current group.
4468 */ 5344 */
4469 5345
4470 for_each_cpu(cpu, sched_group_cpus(sdg)) 5346 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4471 power += power_of(cpu); 5347 struct sched_group *sg = cpu_rq(cpu)->sd->groups;
5348
5349 power_orig += sg->sgp->power_orig;
5350 power += sg->sgp->power;
5351 }
4472 } else { 5352 } else {
4473 /* 5353 /*
4474 * !SD_OVERLAP domains can assume that child groups 5354 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +5357,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
4477 5357
4478 group = child->groups; 5358 group = child->groups;
4479 do { 5359 do {
5360 power_orig += group->sgp->power_orig;
4480 power += group->sgp->power; 5361 power += group->sgp->power;
4481 group = group->next; 5362 group = group->next;
4482 } while (group != child->groups); 5363 } while (group != child->groups);
4483 } 5364 }
4484 5365
4485 sdg->sgp->power_orig = sdg->sgp->power = power; 5366 sdg->sgp->power_orig = power_orig;
5367 sdg->sgp->power = power;
4486} 5368}
4487 5369
4488/* 5370/*
@@ -4526,13 +5408,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4526 * cpu 3 and leave one of the cpus in the second group unused. 5408 * cpu 3 and leave one of the cpus in the second group unused.
4527 * 5409 *
4528 * The current solution to this issue is detecting the skew in the first group 5410 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are 5411 * by noticing the lower domain failed to reach balance and had difficulty
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see 5412 * moving tasks due to affinity constraints.
4531 * sg_imbalanced().
4532 * 5413 *
4533 * When this is so detected; this group becomes a candidate for busiest; see 5414 * When this is so detected; this group becomes a candidate for busiest; see
4534 * update_sd_pick_busiest(). And calculate_imbalance() and 5415 * update_sd_pick_busiest(). And calculate_imbalance() and
4535 * find_busiest_group() avoid some of the usual balance conditional to allow it 5416 * find_busiest_group() avoid some of the usual balance conditions to allow it
4536 * to create an effective group imbalance. 5417 * to create an effective group imbalance.
4537 * 5418 *
4538 * This is a somewhat tricky proposition since the next run might not find the 5419 * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +5421,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4540 * subtle and fragile situation. 5421 * subtle and fragile situation.
4541 */ 5422 */
4542 5423
4543struct sg_imb_stats { 5424static inline int sg_imbalanced(struct sched_group *group)
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{ 5425{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL; 5426 return group->sgp->imbalance;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552} 5427}
4553 5428
4554static inline void 5429/*
4555update_sg_imb_stats(struct sg_imb_stats *sgi, 5430 * Compute the group capacity.
4556 unsigned long load, unsigned long nr_running) 5431 *
5432 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5433 * first dividing out the smt factor and computing the actual number of cores
5434 * and limit power unit capacity with that.
5435 */
5436static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
4557{ 5437{
4558 if (load > sgi->max_cpu_load) 5438 unsigned int capacity, smt, cpus;
4559 sgi->max_cpu_load = load; 5439 unsigned int power, power_orig;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562 5440
4563 if (nr_running > sgi->max_nr_running) 5441 power = group->sgp->power;
4564 sgi->max_nr_running = nr_running; 5442 power_orig = group->sgp->power_orig;
4565 if (sgi->min_nr_running > nr_running) 5443 cpus = group->group_weight;
4566 sgi->min_nr_running = nr_running;
4567}
4568 5444
4569static inline int 5445 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) 5446 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
4571{ 5447 capacity = cpus / smt; /* cores */
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584 5448
4585 return 0; 5449 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5450 if (!capacity)
5451 capacity = fix_small_capacity(env->sd, group);
5452
5453 return capacity;
4586} 5454}
4587 5455
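A sketch of the sg_capacity() arithmetic with illustrative power numbers (an smt_gain of 1178 per core is an assumption, and the fix_small_capacity() fallback for a zero result is omitted):

#include <stdio.h>

#define SCHED_POWER_SCALE       1024
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d) (((n) + (d) / 2) / (d))

/* Divide out the SMT factor first so sibling power does not round up
 * into a phantom extra core. */
static unsigned int sg_capacity(unsigned int cpus, unsigned int power,
                                unsigned int power_orig)
{
        unsigned int smt, capacity, by_power;

        smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
        capacity = cpus / smt;                          /* whole cores */

        by_power = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
        if (capacity > by_power)
                capacity = by_power;
        return capacity;
}

int main(void)
{
        /* 8 SMT siblings = 4 cores x 2 threads, 1178 power per core */
        printf("%u\n", sg_capacity(8, 4712, 4712));     /* prints 4, not a phantom 5 */
        /* 4 independent cores at nominal power 1024 each */
        printf("%u\n", sg_capacity(4, 4096, 4096));     /* prints 4 */
        return 0;
}
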
4588/** 5456/**
@@ -4597,12 +5465,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4597 struct sched_group *group, int load_idx, 5465 struct sched_group *group, int load_idx,
4598 int local_group, struct sg_lb_stats *sgs) 5466 int local_group, struct sg_lb_stats *sgs)
4599{ 5467{
4600 struct sg_imb_stats sgi;
4601 unsigned long nr_running; 5468 unsigned long nr_running;
4602 unsigned long load; 5469 unsigned long load;
4603 int i; 5470 int i;
4604 5471
4605 init_sg_imb_stats(&sgi); 5472 memset(sgs, 0, sizeof(*sgs));
4606 5473
4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5474 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4608 struct rq *rq = cpu_rq(i); 5475 struct rq *rq = cpu_rq(i);
@@ -4610,24 +5477,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4610 nr_running = rq->nr_running; 5477 nr_running = rq->nr_running;
4611 5478
4612 /* Bias balancing toward cpus of our domain */ 5479 /* Bias balancing toward cpus of our domain */
4613 if (local_group) { 5480 if (local_group)
4614 load = target_load(i, load_idx); 5481 load = target_load(i, load_idx);
4615 } else { 5482 else
4616 load = source_load(i, load_idx); 5483 load = source_load(i, load_idx);
4617 update_sg_imb_stats(&sgi, load, nr_running);
4618 }
4619 5484
4620 sgs->group_load += load; 5485 sgs->group_load += load;
4621 sgs->sum_nr_running += nr_running; 5486 sgs->sum_nr_running += nr_running;
5487#ifdef CONFIG_NUMA_BALANCING
5488 sgs->nr_numa_running += rq->nr_numa_running;
5489 sgs->nr_preferred_running += rq->nr_preferred_running;
5490#endif
4622 sgs->sum_weighted_load += weighted_cpuload(i); 5491 sgs->sum_weighted_load += weighted_cpuload(i);
4623 if (idle_cpu(i)) 5492 if (idle_cpu(i))
4624 sgs->idle_cpus++; 5493 sgs->idle_cpus++;
4625 } 5494 }
4626 5495
4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4628 time_after_eq(jiffies, group->sgp->next_update)))
4629 update_group_power(env->sd, env->dst_cpu);
4630
4631 /* Adjust by relative CPU power of the group */ 5496 /* Adjust by relative CPU power of the group */
4632 sgs->group_power = group->sgp->power; 5497 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5498 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +5500,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4635 if (sgs->sum_nr_running) 5500 if (sgs->sum_nr_running)
4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5501 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637 5502
4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4639
4640 sgs->group_capacity =
4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4642
4643 if (!sgs->group_capacity)
4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4646 sgs->group_weight = group->group_weight; 5503 sgs->group_weight = group->group_weight;
4647 5504
5505 sgs->group_imb = sg_imbalanced(group);
5506 sgs->group_capacity = sg_capacity(env, group);
5507
4648 if (sgs->group_capacity > sgs->sum_nr_running) 5508 if (sgs->group_capacity > sgs->sum_nr_running)
4649 sgs->group_has_capacity = 1; 5509 sgs->group_has_capacity = 1;
4650} 5510}
@@ -4693,14 +5553,43 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4693 return false; 5553 return false;
4694} 5554}
4695 5555
5556#ifdef CONFIG_NUMA_BALANCING
5557static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5558{
5559 if (sgs->sum_nr_running > sgs->nr_numa_running)
5560 return regular;
5561 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5562 return remote;
5563 return all;
5564}
5565
5566static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5567{
5568 if (rq->nr_running > rq->nr_numa_running)
5569 return regular;
5570 if (rq->nr_running > rq->nr_preferred_running)
5571 return remote;
5572 return all;
5573}
5574#else
5575static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5576{
5577 return all;
5578}
5579
5580static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5581{
5582 return regular;
5583}
5584#endif /* CONFIG_NUMA_BALANCING */
5585
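A sketch of the three-way classification; the same test is applied to a whole group and to a single runqueue, only the counters feeding it differ. The counts below are made up:

#include <stdio.h>

enum fbq_type { regular, remote, all };

static enum fbq_type classify(unsigned int nr_running,
                              unsigned int nr_numa_running,
                              unsigned int nr_preferred_running)
{
        if (nr_running > nr_numa_running)
                return regular;         /* still has !numa tasks to move */
        if (nr_running > nr_preferred_running)
                return remote;          /* only misplaced numa tasks left */
        return all;                     /* everything is already well placed */
}

int main(void)
{
        printf("%d\n", classify(4, 2, 1)); /* 0: regular */
        printf("%d\n", classify(3, 3, 1)); /* 1: remote */
        printf("%d\n", classify(2, 2, 2)); /* 2: all */
        return 0;
}
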
4696/** 5586/**
4697 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5587 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4698 * @env: The load balancing environment. 5588 * @env: The load balancing environment.
4699 * @balance: Should we balance. 5589 * @balance: Should we balance.
4700 * @sds: variable to hold the statistics for this sched_domain. 5590 * @sds: variable to hold the statistics for this sched_domain.
4701 */ 5591 */
4702static inline void update_sd_lb_stats(struct lb_env *env, 5592static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
4703 struct sd_lb_stats *sds)
4704{ 5593{
4705 struct sched_domain *child = env->sd->child; 5594 struct sched_domain *child = env->sd->child;
4706 struct sched_group *sg = env->sd->groups; 5595 struct sched_group *sg = env->sd->groups;
@@ -4720,11 +5609,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4720 if (local_group) { 5609 if (local_group) {
4721 sds->local = sg; 5610 sds->local = sg;
4722 sgs = &sds->local_stat; 5611 sgs = &sds->local_stat;
5612
5613 if (env->idle != CPU_NEWLY_IDLE ||
5614 time_after_eq(jiffies, sg->sgp->next_update))
5615 update_group_power(env->sd, env->dst_cpu);
4723 } 5616 }
4724 5617
4725 memset(sgs, 0, sizeof(*sgs));
4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 5618 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4727 5619
5620 if (local_group)
5621 goto next_group;
5622
4728 /* 5623 /*
4729 * In case the child domain prefers tasks go to siblings 5624 * In case the child domain prefers tasks go to siblings
4730 * first, lower the sg capacity to one so that we'll try 5625 * first, lower the sg capacity to one so that we'll try
@@ -4735,21 +5630,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4735 * heaviest group when it is already under-utilized (possible 5630 * heaviest group when it is already under-utilized (possible
4736 * with a large weight task outweighs the tasks on the system). 5631 * with a large weight task outweighs the tasks on the system).
4737 */ 5632 */
4738 if (prefer_sibling && !local_group && 5633 if (prefer_sibling && sds->local &&
4739 sds->local && sds->local_stat.group_has_capacity) 5634 sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U); 5635 sgs->group_capacity = min(sgs->group_capacity, 1U);
4741 5636
4742 /* Now, start updating sd_lb_stats */ 5637 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
4743 sds->total_load += sgs->group_load;
4744 sds->total_pwr += sgs->group_power;
4745
4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4747 sds->busiest = sg; 5638 sds->busiest = sg;
4748 sds->busiest_stat = *sgs; 5639 sds->busiest_stat = *sgs;
4749 } 5640 }
4750 5641
5642next_group:
5643 /* Now, start updating sd_lb_stats */
5644 sds->total_load += sgs->group_load;
5645 sds->total_pwr += sgs->group_power;
5646
4751 sg = sg->next; 5647 sg = sg->next;
4752 } while (sg != env->sd->groups); 5648 } while (sg != env->sd->groups);
5649
5650 if (env->sd->flags & SD_NUMA)
5651 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4753} 5652}
4754 5653
4755/** 5654/**
@@ -5053,15 +5952,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5053 int i; 5952 int i;
5054 5953
5055 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5954 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5056 unsigned long power = power_of(i); 5955 unsigned long power, capacity, wl;
5057 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5956 enum fbq_type rt;
5058 SCHED_POWER_SCALE); 5957
5059 unsigned long wl; 5958 rq = cpu_rq(i);
5959 rt = fbq_classify_rq(rq);
5060 5960
5961 /*
5962 * We classify groups/runqueues into three groups:
5963 * - regular: there are !numa tasks
5964 * - remote: there are numa tasks that run on the 'wrong' node
5965 * - all: there is no distinction
5966 *
5967 * In order to avoid migrating ideally placed numa tasks,
5968 * ignore those when there's better options.
5969 *
5970 * If we ignore the actual busiest queue to migrate another
5971 * task, the next balance pass can still reduce the busiest
5972 * queue by moving tasks around inside the node.
5973 *
5974 * If we cannot move enough load due to this classification
5975 * the next pass will adjust the group classification and
5976 * allow migration of more tasks.
5977 *
5978 * Both cases only affect the total convergence complexity.
5979 */
5980 if (rt > env->fbq_type)
5981 continue;
5982
5983 power = power_of(i);
5984 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5061 if (!capacity) 5985 if (!capacity)
5062 capacity = fix_small_capacity(env->sd, group); 5986 capacity = fix_small_capacity(env->sd, group);
5063 5987
5064 rq = cpu_rq(i);
5065 wl = weighted_cpuload(i); 5988 wl = weighted_cpuload(i);
5066 5989
5067 /* 5990 /*
@@ -5164,6 +6087,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5164 int *continue_balancing) 6087 int *continue_balancing)
5165{ 6088{
5166 int ld_moved, cur_ld_moved, active_balance = 0; 6089 int ld_moved, cur_ld_moved, active_balance = 0;
6090 struct sched_domain *sd_parent = sd->parent;
5167 struct sched_group *group; 6091 struct sched_group *group;
5168 struct rq *busiest; 6092 struct rq *busiest;
5169 unsigned long flags; 6093 unsigned long flags;
@@ -5177,6 +6101,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5177 .idle = idle, 6101 .idle = idle,
5178 .loop_break = sched_nr_migrate_break, 6102 .loop_break = sched_nr_migrate_break,
5179 .cpus = cpus, 6103 .cpus = cpus,
6104 .fbq_type = all,
5180 }; 6105 };
5181 6106
5182 /* 6107 /*
@@ -5268,17 +6193,17 @@ more_balance:
5268 * moreover subsequent load balance cycles should correct the 6193 * moreover subsequent load balance cycles should correct the
5269 * excess load moved. 6194 * excess load moved.
5270 */ 6195 */
5271 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6196 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6197
6198 /* Prevent to re-select dst_cpu via env's cpus */
6199 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5272 6200
5273 env.dst_rq = cpu_rq(env.new_dst_cpu); 6201 env.dst_rq = cpu_rq(env.new_dst_cpu);
5274 env.dst_cpu = env.new_dst_cpu; 6202 env.dst_cpu = env.new_dst_cpu;
5275 env.flags &= ~LBF_SOME_PINNED; 6203 env.flags &= ~LBF_DST_PINNED;
5276 env.loop = 0; 6204 env.loop = 0;
5277 env.loop_break = sched_nr_migrate_break; 6205 env.loop_break = sched_nr_migrate_break;
5278 6206
5279 /* Prevent to re-select dst_cpu via env's cpus */
5280 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5281
5282 /* 6207 /*
5283 * Go back to "more_balance" rather than "redo" since we 6208 * Go back to "more_balance" rather than "redo" since we
5284 * need to continue with same src_cpu. 6209 * need to continue with same src_cpu.
@@ -5286,6 +6211,18 @@ more_balance:
5286 goto more_balance; 6211 goto more_balance;
5287 } 6212 }
5288 6213
6214 /*
6215 * We failed to reach balance because of affinity.
6216 */
6217 if (sd_parent) {
6218 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6219
6220 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6221 *group_imbalance = 1;
6222 } else if (*group_imbalance)
6223 *group_imbalance = 0;
6224 }
6225
5289 /* All tasks on this runqueue were pinned by CPU affinity */ 6226 /* All tasks on this runqueue were pinned by CPU affinity */
5290 if (unlikely(env.flags & LBF_ALL_PINNED)) { 6227 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5291 cpumask_clear_cpu(cpu_of(busiest), cpus); 6228 cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5393,6 +6330,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5393 struct sched_domain *sd; 6330 struct sched_domain *sd;
5394 int pulled_task = 0; 6331 int pulled_task = 0;
5395 unsigned long next_balance = jiffies + HZ; 6332 unsigned long next_balance = jiffies + HZ;
6333 u64 curr_cost = 0;
5396 6334
5397 this_rq->idle_stamp = rq_clock(this_rq); 6335 this_rq->idle_stamp = rq_clock(this_rq);
5398 6336
@@ -5409,15 +6347,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5409 for_each_domain(this_cpu, sd) { 6347 for_each_domain(this_cpu, sd) {
5410 unsigned long interval; 6348 unsigned long interval;
5411 int continue_balancing = 1; 6349 int continue_balancing = 1;
6350 u64 t0, domain_cost;
5412 6351
5413 if (!(sd->flags & SD_LOAD_BALANCE)) 6352 if (!(sd->flags & SD_LOAD_BALANCE))
5414 continue; 6353 continue;
5415 6354
6355 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6356 break;
6357
5416 if (sd->flags & SD_BALANCE_NEWIDLE) { 6358 if (sd->flags & SD_BALANCE_NEWIDLE) {
6359 t0 = sched_clock_cpu(this_cpu);
6360
5417 /* If we've pulled tasks over stop searching: */ 6361 /* If we've pulled tasks over stop searching: */
5418 pulled_task = load_balance(this_cpu, this_rq, 6362 pulled_task = load_balance(this_cpu, this_rq,
5419 sd, CPU_NEWLY_IDLE, 6363 sd, CPU_NEWLY_IDLE,
5420 &continue_balancing); 6364 &continue_balancing);
6365
6366 domain_cost = sched_clock_cpu(this_cpu) - t0;
6367 if (domain_cost > sd->max_newidle_lb_cost)
6368 sd->max_newidle_lb_cost = domain_cost;
6369
6370 curr_cost += domain_cost;
5421 } 6371 }
5422 6372
5423 interval = msecs_to_jiffies(sd->balance_interval); 6373 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5439,6 +6389,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5439 */ 6389 */
5440 this_rq->next_balance = next_balance; 6390 this_rq->next_balance = next_balance;
5441 } 6391 }
6392
6393 if (curr_cost > this_rq->max_idle_balance_cost)
6394 this_rq->max_idle_balance_cost = curr_cost;
5442} 6395}
5443 6396
5444/* 6397/*
@@ -5662,15 +6615,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5662 /* Earliest time when we have to do rebalance again */ 6615 /* Earliest time when we have to do rebalance again */
5663 unsigned long next_balance = jiffies + 60*HZ; 6616 unsigned long next_balance = jiffies + 60*HZ;
5664 int update_next_balance = 0; 6617 int update_next_balance = 0;
5665 int need_serialize; 6618 int need_serialize, need_decay = 0;
6619 u64 max_cost = 0;
5666 6620
5667 update_blocked_averages(cpu); 6621 update_blocked_averages(cpu);
5668 6622
5669 rcu_read_lock(); 6623 rcu_read_lock();
5670 for_each_domain(cpu, sd) { 6624 for_each_domain(cpu, sd) {
6625 /*
6626 * Decay the newidle max times here because this is a regular
6627 * visit to all the domains. Decay ~1% per second.
6628 */
6629 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6630 sd->max_newidle_lb_cost =
6631 (sd->max_newidle_lb_cost * 253) / 256;
6632 sd->next_decay_max_lb_cost = jiffies + HZ;
6633 need_decay = 1;
6634 }
6635 max_cost += sd->max_newidle_lb_cost;
6636
5671 if (!(sd->flags & SD_LOAD_BALANCE)) 6637 if (!(sd->flags & SD_LOAD_BALANCE))
5672 continue; 6638 continue;
5673 6639
6640 /*
6641 * Stop the load balance at this level. There is another
6642 * CPU in our sched group which is doing load balancing more
6643 * actively.
6644 */
6645 if (!continue_balancing) {
6646 if (need_decay)
6647 continue;
6648 break;
6649 }
6650
5674 interval = sd->balance_interval; 6651 interval = sd->balance_interval;
5675 if (idle != CPU_IDLE) 6652 if (idle != CPU_IDLE)
5676 interval *= sd->busy_factor; 6653 interval *= sd->busy_factor;
@@ -5689,7 +6666,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5689 if (time_after_eq(jiffies, sd->last_balance + interval)) { 6666 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5690 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { 6667 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5691 /* 6668 /*
5692 * The LBF_SOME_PINNED logic could have changed 6669 * The LBF_DST_PINNED logic could have changed
5693 * env->dst_cpu, so we can't know our idle 6670 * env->dst_cpu, so we can't know our idle
5694 * state even if we migrated tasks. Update it. 6671 * state even if we migrated tasks. Update it.
5695 */ 6672 */
@@ -5704,14 +6681,14 @@ out:
5704 next_balance = sd->last_balance + interval; 6681 next_balance = sd->last_balance + interval;
5705 update_next_balance = 1; 6682 update_next_balance = 1;
5706 } 6683 }
5707 6684 }
6685 if (need_decay) {
5708 /* 6686 /*
5709 * Stop the load balance at this level. There is another 6687 * Ensure the rq-wide value also decays but keep it at a
5710 * CPU in our sched group which is doing load balancing more 6688 * reasonable floor to avoid funnies with rq->avg_idle.
5711 * actively.
5712 */ 6689 */
5713 if (!continue_balancing) 6690 rq->max_idle_balance_cost =
5714 break; 6691 max((u64)sysctl_sched_migration_cost, max_cost);
5715 } 6692 }
5716 rcu_read_unlock(); 6693 rcu_read_unlock();
5717 6694
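The 253/256 factor in the decay above loses roughly 1.2% per step. A quick sketch of how fast a remembered newidle balance cost fades at one step per second (the 500us starting cost is hypothetical):

#include <stdio.h>

int main(void)
{
        unsigned long long cost = 500000;       /* 500us, in nanoseconds */
        int sec;

        for (sec = 1; sec <= 60; sec++)
                cost = cost * 253 / 256;        /* ~1% decay per second */

        /* after one minute the remembered cost has dropped below half */
        printf("cost after 60s: %lluns\n", cost);
        return 0;
}
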
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8e4799..5716929a2e3a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
63/* 63/*
64 * Apply the automatic NUMA scheduling policy. Enabled automatically 64 * Apply the automatic NUMA scheduling policy. Enabled automatically
65 * at runtime if running on a NUMA machine. Can be controlled via 65 * at runtime if running on a NUMA machine. Can be controlled via
66 * numa_balancing=. Allow PTE scanning to be forced on UMA machines 66 * numa_balancing=
67 * for debugging the core machinery.
68 */ 67 */
69#ifdef CONFIG_NUMA_BALANCING 68#ifdef CONFIG_NUMA_BALANCING
70SCHED_FEAT(NUMA, false) 69SCHED_FEAT(NUMA, false)
71SCHED_FEAT(NUMA_FORCE, false) 70
71/*
72 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
73 * higher number of hinting faults are recorded during active load
74 * balancing.
75 */
76SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
77
78/*
79 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
80 * lower number of hinting faults have been recorded. As this has
81 * the potential to prevent a task ever migrating to a new node
82 * due to CPU overload it is disabled by default.
83 */
84SCHED_FEAT(NUMA_RESIST_LOWER, false)
72#endif 85#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..516c3d9ceea1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 12select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks as never migrated */ 14 return task_cpu(p); /* IDLE tasks as never migrated */
15} 15}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 01970c8e64df..e9304cdc26fe 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1169,13 +1169,10 @@ static void yield_task_rt(struct rq *rq)
1169static int find_lowest_rq(struct task_struct *task); 1169static int find_lowest_rq(struct task_struct *task);
1170 1170
1171static int 1171static int
1172select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 1172select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1173{ 1173{
1174 struct task_struct *curr; 1174 struct task_struct *curr;
1175 struct rq *rq; 1175 struct rq *rq;
1176 int cpu;
1177
1178 cpu = task_cpu(p);
1179 1176
1180 if (p->nr_cpus_allowed == 1) 1177 if (p->nr_cpus_allowed == 1)
1181 goto out; 1178 goto out;
@@ -1213,8 +1210,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1213 */ 1210 */
1214 if (curr && unlikely(rt_task(curr)) && 1211 if (curr && unlikely(rt_task(curr)) &&
1215 (curr->nr_cpus_allowed < 2 || 1212 (curr->nr_cpus_allowed < 2 ||
1216 curr->prio <= p->prio) && 1213 curr->prio <= p->prio)) {
1217 (p->nr_cpus_allowed > 1)) {
1218 int target = find_lowest_rq(p); 1214 int target = find_lowest_rq(p);
1219 1215
1220 if (target != -1) 1216 if (target != -1)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3c5653e1dca..d69cb325c27e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h> 8#include <linux/tick.h>
9#include <linux/slab.h>
9 10
10#include "cpupri.h" 11#include "cpupri.h"
11#include "cpuacct.h" 12#include "cpuacct.h"
@@ -408,6 +409,10 @@ struct rq {
408 * remote CPUs use both these fields when doing load calculation. 409 * remote CPUs use both these fields when doing load calculation.
409 */ 410 */
410 unsigned int nr_running; 411 unsigned int nr_running;
412#ifdef CONFIG_NUMA_BALANCING
413 unsigned int nr_numa_running;
414 unsigned int nr_preferred_running;
415#endif
411 #define CPU_LOAD_IDX_MAX 5 416 #define CPU_LOAD_IDX_MAX 5
412 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 417 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
413 unsigned long last_load_update_tick; 418 unsigned long last_load_update_tick;
@@ -476,6 +481,9 @@ struct rq {
476 u64 age_stamp; 481 u64 age_stamp;
477 u64 idle_stamp; 482 u64 idle_stamp;
478 u64 avg_idle; 483 u64 avg_idle;
484
485 /* This is used to determine avg_idle's max value */
486 u64 max_idle_balance_cost;
479#endif 487#endif
480 488
481#ifdef CONFIG_IRQ_TIME_ACCOUNTING 489#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
552 return rq->clock_task; 560 return rq->clock_task;
553} 561}
554 562
563#ifdef CONFIG_NUMA_BALANCING
564extern void sched_setnuma(struct task_struct *p, int node);
565extern int migrate_task_to(struct task_struct *p, int cpu);
566extern int migrate_swap(struct task_struct *, struct task_struct *);
567#endif /* CONFIG_NUMA_BALANCING */
568
555#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
556 570
557#define rcu_dereference_check_sched_domain(p) \ 571#define rcu_dereference_check_sched_domain(p) \
@@ -593,9 +607,22 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
593 return hsd; 607 return hsd;
594} 608}
595 609
610static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
611{
612 struct sched_domain *sd;
613
614 for_each_domain(cpu, sd) {
615 if (sd->flags & flag)
616 break;
617 }
618
619 return sd;
620}
621
596DECLARE_PER_CPU(struct sched_domain *, sd_llc); 622DECLARE_PER_CPU(struct sched_domain *, sd_llc);
597DECLARE_PER_CPU(int, sd_llc_size); 623DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 624DECLARE_PER_CPU(int, sd_llc_id);
625DECLARE_PER_CPU(struct sched_domain *, sd_numa);
599 626
600struct sched_group_power { 627struct sched_group_power {
601 atomic_t ref; 628 atomic_t ref;
@@ -605,6 +632,7 @@ struct sched_group_power {
605 */ 632 */
606 unsigned int power, power_orig; 633 unsigned int power, power_orig;
607 unsigned long next_update; 634 unsigned long next_update;
635 int imbalance; /* XXX unrelated to power but shared group state */
608 /* 636 /*
609 * Number of busy cpus in this group. 637 * Number of busy cpus in this group.
610 */ 638 */
@@ -719,6 +747,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
719 */ 747 */
720 smp_wmb(); 748 smp_wmb();
721 task_thread_info(p)->cpu = cpu; 749 task_thread_info(p)->cpu = cpu;
750 p->wake_cpu = cpu;
722#endif 751#endif
723} 752}
724 753
@@ -974,7 +1003,7 @@ struct sched_class {
974 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1003 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
975 1004
976#ifdef CONFIG_SMP 1005#ifdef CONFIG_SMP
977 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1006 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
978 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1007 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
979 1008
980 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1009 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
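The sched.h hunk above adds lowest_flag_domain(), which walks the per-CPU sched_domain hierarchy bottom-up and stops at the first level carrying the requested flag (used with SD_NUMA to fill the new sd_numa per-cpu pointer). A minimal userspace sketch of that walk, using a stand-in parent-linked struct rather than the kernel's sched_domain:

#include <stdio.h>

/* Stand-in for a bottom-up, parent-linked domain hierarchy. */
struct level {
	const char *name;
	int flags;
	struct level *parent;
};

/* Return the lowest level whose flags contain "flag", or NULL if none does. */
static struct level *lowest_with_flag(struct level *l, int flag)
{
	for (; l; l = l->parent)
		if (l->flags & flag)
			break;
	return l;
}

int main(void)
{
	struct level numa = { "numa", 0x4, NULL };
	struct level llc  = { "llc",  0x2, &numa };
	struct level smt  = { "smt",  0x2, &llc  };
	struct level *found = lowest_with_flag(&smt, 0x4);

	printf("lowest level with flag 0x4: %s\n", found ? found->name : "none");
	return 0;
}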
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c7edee71bce8..4ab704339656 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The 59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew. 60 * delta taken on each cpu would annul the skew.
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
63{ 63{
64 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 64 unsigned long long now = rq_clock(rq), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
69 sched_info_reset_dequeued(t); 69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta; 70 t->sched_info.run_delay += delta;
71 71
72 rq_sched_info_dequeued(task_rq(t), delta); 72 rq_sched_info_dequeued(rq, delta);
73} 73}
74 74
75/* 75/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
77 * long it was waiting to run. We also note when it began so that we 77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is. 78 * can keep stats on how long its timeslice is.
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct rq *rq, struct task_struct *t)
81{ 81{
82 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 82 unsigned long long now = rq_clock(rq), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
88 t->sched_info.last_arrival = now; 88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++; 89 t->sched_info.pcount++;
90 90
91 rq_sched_info_arrive(task_rq(t), delta); 91 rq_sched_info_arrive(rq, delta);
92} 92}
93 93
94/* 94/*
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)
96 * the timestamp if it is already not set. It's assumed that 96 * the timestamp if it is already not set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate. 97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */ 98 */
99static inline void sched_info_queued(struct task_struct *t) 99static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = rq_clock(task_rq(t)); 103 t->sched_info.last_queued = rq_clock(rq);
104} 104}
105 105
106/* 106/*
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)
111 * sched_info_queued() to mark that it has now again started waiting on 111 * sched_info_queued() to mark that it has now again started waiting on
112 * the runqueue. 112 * the runqueue.
113 */ 113 */
114static inline void sched_info_depart(struct task_struct *t) 114static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
115{ 115{
116 unsigned long long delta = rq_clock(task_rq(t)) - 116 unsigned long long delta = rq_clock(rq) -
117 t->sched_info.last_arrival; 117 t->sched_info.last_arrival;
118 118
119 rq_sched_info_depart(task_rq(t), delta); 119 rq_sched_info_depart(rq, delta);
120 120
121 if (t->state == TASK_RUNNING) 121 if (t->state == TASK_RUNNING)
122 sched_info_queued(t); 122 sched_info_queued(rq, t);
123} 123}
124 124
125/* 125/*
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
128 * the idle task.) We are only called when prev != next. 128 * the idle task.) We are only called when prev != next.
129 */ 129 */
130static inline void 130static inline void
131__sched_info_switch(struct task_struct *prev, struct task_struct *next) 131__sched_info_switch(struct rq *rq,
132 struct task_struct *prev, struct task_struct *next)
132{ 133{
133 struct rq *rq = task_rq(prev);
134
135 /* 134 /*
136 * prev now departs the cpu. It's not interesting to record 135 * prev now departs the cpu. It's not interesting to record
137 * stats about how efficient we were at scheduling the idle 136 * stats about how efficient we were at scheduling the idle
138 * process, however. 137 * process, however.
139 */ 138 */
140 if (prev != rq->idle) 139 if (prev != rq->idle)
141 sched_info_depart(prev); 140 sched_info_depart(rq, prev);
142 141
143 if (next != rq->idle) 142 if (next != rq->idle)
144 sched_info_arrive(next); 143 sched_info_arrive(rq, next);
145} 144}
146static inline void 145static inline void
147sched_info_switch(struct task_struct *prev, struct task_struct *next) 146sched_info_switch(struct rq *rq,
147 struct task_struct *prev, struct task_struct *next)
148{ 148{
149 if (unlikely(sched_info_on())) 149 if (unlikely(sched_info_on()))
150 __sched_info_switch(prev, next); 150 __sched_info_switch(rq, prev, next);
151} 151}
152#else 152#else
153#define sched_info_queued(t) do { } while (0) 153#define sched_info_queued(rq, t) do { } while (0)
154#define sched_info_reset_dequeued(t) do { } while (0) 154#define sched_info_reset_dequeued(t) do { } while (0)
155#define sched_info_dequeued(t) do { } while (0) 155#define sched_info_dequeued(rq, t) do { } while (0)
156#define sched_info_switch(t, next) do { } while (0) 156#define sched_info_depart(rq, t) do { } while (0)
157#define sched_info_arrive(rq, next) do { } while (0)
158#define sched_info_switch(rq, t, next) do { } while (0)
157#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 159#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
158 160
159/* 161/*
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index e08fbeeb54b9..47197de8abd9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
11 11
12#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
13static int 13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) 14select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
15{ 15{
 16	return task_cpu(p); /* stop tasks never migrate */ 16	return task_cpu(p); /* stop tasks never migrate */
17} 17}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7d498d8cc4f..dcab1d3fb53d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -100,13 +100,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
100 100
101 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
102 /* 102 /*
103 * The preempt tracer hooks into add_preempt_count and will break 103 * The preempt tracer hooks into preempt_count_add and will break
104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
105 * is set and before current->softirq_enabled is cleared. 105 * is set and before current->softirq_enabled is cleared.
106 * We must manually increment preempt_count here and manually 106 * We must manually increment preempt_count here and manually
107 * call the trace_preempt_off later. 107 * call the trace_preempt_off later.
108 */ 108 */
109 preempt_count() += cnt; 109 __preempt_count_add(cnt);
110 /* 110 /*
111 * Were softirqs turned off above: 111 * Were softirqs turned off above:
112 */ 112 */
@@ -120,7 +120,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
120#else /* !CONFIG_TRACE_IRQFLAGS */ 120#else /* !CONFIG_TRACE_IRQFLAGS */
121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
122{ 122{
123 add_preempt_count(cnt); 123 preempt_count_add(cnt);
124 barrier(); 124 barrier();
125} 125}
126#endif /* CONFIG_TRACE_IRQFLAGS */ 126#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -139,7 +139,7 @@ static void __local_bh_enable(unsigned int cnt)
139 139
140 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
141 trace_softirqs_on(_RET_IP_); 141 trace_softirqs_on(_RET_IP_);
142 sub_preempt_count(cnt); 142 preempt_count_sub(cnt);
143} 143}
144 144
145/* 145/*
@@ -169,12 +169,12 @@ static inline void _local_bh_enable_ip(unsigned long ip)
169 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
170 * softirq processing: 170 * softirq processing:
171 */ 171 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 172 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
173 173
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
175 do_softirq(); 175 do_softirq();
176 176
177 dec_preempt_count(); 177 preempt_count_dec();
178#ifdef CONFIG_TRACE_IRQFLAGS 178#ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 179 local_irq_enable();
180#endif 180#endif
@@ -256,7 +256,7 @@ restart:
256 " exited with %08x?\n", vec_nr, 256 " exited with %08x?\n", vec_nr,
257 softirq_to_name[vec_nr], h->action, 257 softirq_to_name[vec_nr], h->action,
258 prev_count, preempt_count()); 258 prev_count, preempt_count());
259 preempt_count() = prev_count; 259 preempt_count_set(prev_count);
260 } 260 }
261 261
262 rcu_bh_qs(cpu); 262 rcu_bh_qs(cpu);
@@ -369,7 +369,7 @@ void irq_exit(void)
369 369
370 account_irq_exit_time(current); 370 account_irq_exit_time(current);
371 trace_hardirq_exit(); 371 trace_hardirq_exit();
372 sub_preempt_count(HARDIRQ_OFFSET); 372 preempt_count_sub(HARDIRQ_OFFSET);
373 if (!in_interrupt() && local_softirq_pending()) 373 if (!in_interrupt() && local_softirq_pending())
374 invoke_softirq(); 374 invoke_softirq();
375 375
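The softirq.c hunks above (and the timer.c and locking-selftest hunks further down) stop treating preempt_count() as an assignable lvalue and route every update through add/sub/dec/set helpers. A rough sketch of that accessor pattern over a plain thread-local counter; the names only mirror the shape of preempt_count_add()/preempt_count_sub()/preempt_count_set() seen in the diff, and the bodies are illustrative, not the kernel's per-cpu implementation:

#include <stdio.h>

static _Thread_local int my_preempt_count;

/* Accessors instead of open-coded "count += n" arithmetic. */
static int  my_count(void)         { return my_preempt_count; }
static void my_count_set(int val)  { my_preempt_count = val; }
static void my_count_add(int cnt)  { my_preempt_count += cnt; }
static void my_count_sub(int cnt)  { my_preempt_count -= cnt; }
static void my_count_dec(void)     { my_count_sub(1); }

int main(void)
{
	int saved = my_count();

	my_count_add(2);	/* enter a "softirq disabled" section */
	my_count_sub(1);	/* leave it in two steps, as _local_bh_enable_ip() above does */
	my_count_dec();

	/* call_timer_fn()-style leak check: restore the count if unbalanced. */
	if (my_count() != saved)
		my_count_set(saved);

	printf("counter balanced at %d\n", my_count());
	return 0;
}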
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index c09f2955ae30..32a6c44d8f78 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -115,6 +115,166 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
115 return done.executed ? done.ret : -ENOENT; 115 return done.executed ? done.ret : -ENOENT;
116} 116}
117 117
118/* This controls the threads on each CPU. */
119enum multi_stop_state {
120 /* Dummy starting state for thread. */
121 MULTI_STOP_NONE,
122 /* Awaiting everyone to be scheduled. */
123 MULTI_STOP_PREPARE,
124 /* Disable interrupts. */
125 MULTI_STOP_DISABLE_IRQ,
126 /* Run the function */
127 MULTI_STOP_RUN,
128 /* Exit */
129 MULTI_STOP_EXIT,
130};
131
132struct multi_stop_data {
133 int (*fn)(void *);
134 void *data;
135 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
136 unsigned int num_threads;
137 const struct cpumask *active_cpus;
138
139 enum multi_stop_state state;
140 atomic_t thread_ack;
141};
142
143static void set_state(struct multi_stop_data *msdata,
144 enum multi_stop_state newstate)
145{
146 /* Reset ack counter. */
147 atomic_set(&msdata->thread_ack, msdata->num_threads);
148 smp_wmb();
149 msdata->state = newstate;
150}
151
152/* Last one to ack a state moves to the next state. */
153static void ack_state(struct multi_stop_data *msdata)
154{
155 if (atomic_dec_and_test(&msdata->thread_ack))
156 set_state(msdata, msdata->state + 1);
157}
158
159/* This is the cpu_stop function which stops the CPU. */
160static int multi_cpu_stop(void *data)
161{
162 struct multi_stop_data *msdata = data;
163 enum multi_stop_state curstate = MULTI_STOP_NONE;
164 int cpu = smp_processor_id(), err = 0;
165 unsigned long flags;
166 bool is_active;
167
168 /*
169 * When called from stop_machine_from_inactive_cpu(), irq might
170 * already be disabled. Save the state and restore it on exit.
171 */
172 local_save_flags(flags);
173
174 if (!msdata->active_cpus)
175 is_active = cpu == cpumask_first(cpu_online_mask);
176 else
177 is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
178
179 /* Simple state machine */
180 do {
181 /* Chill out and ensure we re-read multi_stop_state. */
182 cpu_relax();
183 if (msdata->state != curstate) {
184 curstate = msdata->state;
185 switch (curstate) {
186 case MULTI_STOP_DISABLE_IRQ:
187 local_irq_disable();
188 hard_irq_disable();
189 break;
190 case MULTI_STOP_RUN:
191 if (is_active)
192 err = msdata->fn(msdata->data);
193 break;
194 default:
195 break;
196 }
197 ack_state(msdata);
198 }
199 } while (curstate != MULTI_STOP_EXIT);
200
201 local_irq_restore(flags);
202 return err;
203}
204
205struct irq_cpu_stop_queue_work_info {
206 int cpu1;
207 int cpu2;
208 struct cpu_stop_work *work1;
209 struct cpu_stop_work *work2;
210};
211
212/*
213 * This function is always run with irqs and preemption disabled.
214 * This guarantees that both work1 and work2 get queued, before
215 * our local migrate thread gets the chance to preempt us.
216 */
217static void irq_cpu_stop_queue_work(void *arg)
218{
219 struct irq_cpu_stop_queue_work_info *info = arg;
220 cpu_stop_queue_work(info->cpu1, info->work1);
221 cpu_stop_queue_work(info->cpu2, info->work2);
222}
223
224/**
225 * stop_two_cpus - stops two cpus
226 * @cpu1: the cpu to stop
227 * @cpu2: the other cpu to stop
228 * @fn: function to execute
229 * @arg: argument to @fn
230 *
231 * Stops both the current and specified CPU and runs @fn on one of them.
232 *
233 * returns when both are completed.
234 */
235int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
236{
237 int call_cpu;
238 struct cpu_stop_done done;
239 struct cpu_stop_work work1, work2;
240 struct irq_cpu_stop_queue_work_info call_args;
241 struct multi_stop_data msdata = {
242 .fn = fn,
243 .data = arg,
244 .num_threads = 2,
245 .active_cpus = cpumask_of(cpu1),
246 };
247
248 work1 = work2 = (struct cpu_stop_work){
249 .fn = multi_cpu_stop,
250 .arg = &msdata,
251 .done = &done
252 };
253
254 call_args = (struct irq_cpu_stop_queue_work_info){
255 .cpu1 = cpu1,
256 .cpu2 = cpu2,
257 .work1 = &work1,
258 .work2 = &work2,
259 };
260
261 cpu_stop_init_done(&done, 2);
262 set_state(&msdata, MULTI_STOP_PREPARE);
263
264 /*
265 * Queuing needs to be done by the lowest numbered CPU, to ensure
266 * that works are always queued in the same order on every CPU.
267 * This prevents deadlocks.
268 */
269 call_cpu = min(cpu1, cpu2);
270
271 smp_call_function_single(call_cpu, &irq_cpu_stop_queue_work,
272 &call_args, 0);
273
274 wait_for_completion(&done.completion);
275 return done.executed ? done.ret : -ENOENT;
276}
277
118/** 278/**
119 * stop_one_cpu_nowait - stop a cpu but don't wait for completion 279 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
120 * @cpu: cpu to stop 280 * @cpu: cpu to stop
@@ -359,98 +519,14 @@ early_initcall(cpu_stop_init);
359 519
360#ifdef CONFIG_STOP_MACHINE 520#ifdef CONFIG_STOP_MACHINE
361 521
362/* This controls the threads on each CPU. */
363enum stopmachine_state {
364 /* Dummy starting state for thread. */
365 STOPMACHINE_NONE,
366 /* Awaiting everyone to be scheduled. */
367 STOPMACHINE_PREPARE,
368 /* Disable interrupts. */
369 STOPMACHINE_DISABLE_IRQ,
370 /* Run the function */
371 STOPMACHINE_RUN,
372 /* Exit */
373 STOPMACHINE_EXIT,
374};
375
376struct stop_machine_data {
377 int (*fn)(void *);
378 void *data;
379 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
380 unsigned int num_threads;
381 const struct cpumask *active_cpus;
382
383 enum stopmachine_state state;
384 atomic_t thread_ack;
385};
386
387static void set_state(struct stop_machine_data *smdata,
388 enum stopmachine_state newstate)
389{
390 /* Reset ack counter. */
391 atomic_set(&smdata->thread_ack, smdata->num_threads);
392 smp_wmb();
393 smdata->state = newstate;
394}
395
396/* Last one to ack a state moves to the next state. */
397static void ack_state(struct stop_machine_data *smdata)
398{
399 if (atomic_dec_and_test(&smdata->thread_ack))
400 set_state(smdata, smdata->state + 1);
401}
402
403/* This is the cpu_stop function which stops the CPU. */
404static int stop_machine_cpu_stop(void *data)
405{
406 struct stop_machine_data *smdata = data;
407 enum stopmachine_state curstate = STOPMACHINE_NONE;
408 int cpu = smp_processor_id(), err = 0;
409 unsigned long flags;
410 bool is_active;
411
412 /*
413 * When called from stop_machine_from_inactive_cpu(), irq might
414 * already be disabled. Save the state and restore it on exit.
415 */
416 local_save_flags(flags);
417
418 if (!smdata->active_cpus)
419 is_active = cpu == cpumask_first(cpu_online_mask);
420 else
421 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
422
423 /* Simple state machine */
424 do {
425 /* Chill out and ensure we re-read stopmachine_state. */
426 cpu_relax();
427 if (smdata->state != curstate) {
428 curstate = smdata->state;
429 switch (curstate) {
430 case STOPMACHINE_DISABLE_IRQ:
431 local_irq_disable();
432 hard_irq_disable();
433 break;
434 case STOPMACHINE_RUN:
435 if (is_active)
436 err = smdata->fn(smdata->data);
437 break;
438 default:
439 break;
440 }
441 ack_state(smdata);
442 }
443 } while (curstate != STOPMACHINE_EXIT);
444
445 local_irq_restore(flags);
446 return err;
447}
448
449int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 522int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
450{ 523{
451 struct stop_machine_data smdata = { .fn = fn, .data = data, 524 struct multi_stop_data msdata = {
452 .num_threads = num_online_cpus(), 525 .fn = fn,
453 .active_cpus = cpus }; 526 .data = data,
527 .num_threads = num_online_cpus(),
528 .active_cpus = cpus,
529 };
454 530
455 if (!stop_machine_initialized) { 531 if (!stop_machine_initialized) {
456 /* 532 /*
@@ -461,7 +537,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
461 unsigned long flags; 537 unsigned long flags;
462 int ret; 538 int ret;
463 539
464 WARN_ON_ONCE(smdata.num_threads != 1); 540 WARN_ON_ONCE(msdata.num_threads != 1);
465 541
466 local_irq_save(flags); 542 local_irq_save(flags);
467 hard_irq_disable(); 543 hard_irq_disable();
@@ -472,8 +548,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
472 } 548 }
473 549
474 /* Set the initial state and stop all online cpus. */ 550 /* Set the initial state and stop all online cpus. */
475 set_state(&smdata, STOPMACHINE_PREPARE); 551 set_state(&msdata, MULTI_STOP_PREPARE);
476 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 552 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
477} 553}
478 554
479int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 555int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +589,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
513int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 589int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
514 const struct cpumask *cpus) 590 const struct cpumask *cpus)
515{ 591{
516 struct stop_machine_data smdata = { .fn = fn, .data = data, 592 struct multi_stop_data msdata = { .fn = fn, .data = data,
517 .active_cpus = cpus }; 593 .active_cpus = cpus };
518 struct cpu_stop_done done; 594 struct cpu_stop_done done;
519 int ret; 595 int ret;
520 596
521 /* Local CPU must be inactive and CPU hotplug in progress. */ 597 /* Local CPU must be inactive and CPU hotplug in progress. */
522 BUG_ON(cpu_active(raw_smp_processor_id())); 598 BUG_ON(cpu_active(raw_smp_processor_id()));
523 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ 599 msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
524 600
525 /* No proper task established and can't sleep - busy wait for lock. */ 601 /* No proper task established and can't sleep - busy wait for lock. */
526 while (!mutex_trylock(&stop_cpus_mutex)) 602 while (!mutex_trylock(&stop_cpus_mutex))
527 cpu_relax(); 603 cpu_relax();
528 604
529 /* Schedule work on other CPUs and execute directly for local CPU */ 605 /* Schedule work on other CPUs and execute directly for local CPU */
530 set_state(&smdata, STOPMACHINE_PREPARE); 606 set_state(&msdata, MULTI_STOP_PREPARE);
531 cpu_stop_init_done(&done, num_active_cpus()); 607 cpu_stop_init_done(&done, num_active_cpus());
532 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, 608 queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
533 &done); 609 &done);
534 ret = stop_machine_cpu_stop(&smdata); 610 ret = multi_cpu_stop(&msdata);
535 611
536 /* Busy wait for completion. */ 612 /* Busy wait for completion. */
537 while (!completion_done(&done.completion)) 613 while (!completion_done(&done.completion))
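The stop_machine.c changes above generalise the old stop-machine state machine into multi_cpu_stop(): every participating thread acknowledges each state, and the last acknowledger advances the shared state, so all CPUs step through PREPARE/DISABLE_IRQ/RUN/EXIT in lockstep. A self-contained userspace sketch of that ack-based lockstep, assuming C11 atomics and POSIX threads (build with cc -pthread); it illustrates the pattern only, it is not the kernel code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum step { STEP_NONE, STEP_PREPARE, STEP_RUN, STEP_EXIT };

struct lockstep {
	_Atomic enum step state;
	atomic_int thread_ack;
	int num_threads;
};

/* Reset the ack counter, then publish the new state (cf. set_state() above). */
static void set_step(struct lockstep *ls, enum step next)
{
	atomic_store(&ls->thread_ack, ls->num_threads);
	atomic_store(&ls->state, next);
}

/* Last thread to acknowledge a state moves everyone to the next one. */
static void ack_step(struct lockstep *ls)
{
	if (atomic_fetch_sub(&ls->thread_ack, 1) == 1)
		set_step(ls, (enum step)(atomic_load(&ls->state) + 1));
}

static void *worker(void *arg)
{
	struct lockstep *ls = arg;
	enum step cur = STEP_NONE;

	do {
		/* Busy-wait for a state change, like the cpu_relax() loop above. */
		enum step seen = atomic_load(&ls->state);
		if (seen != cur) {
			cur = seen;
			if (cur == STEP_RUN)
				puts("a worker reached STEP_RUN in lockstep");
			ack_step(ls);
		}
	} while (cur != STEP_EXIT);
	return NULL;
}

int main(void)
{
	struct lockstep ls = { .num_threads = 4 };
	pthread_t tid[4];

	set_step(&ls, STEP_PREPARE);
	for (int i = 0; i < 4; i++)
		pthread_create(&tid[i], NULL, worker, &ls);
	for (int i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

The important property is that no thread can run ahead: a state advances only once every participant has acknowledged the previous one, which is what lets stop_two_cpus() and __stop_machine() share the same worker function.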
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2f06f3c6a3f..a159e1fd2013 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = {
371 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
372 }, 372 },
373 { 373 {
374 .procname = "numa_balancing_scan_period_reset",
375 .data = &sysctl_numa_balancing_scan_period_reset,
376 .maxlen = sizeof(unsigned int),
377 .mode = 0644,
378 .proc_handler = proc_dointvec,
379 },
380 {
381 .procname = "numa_balancing_scan_period_max_ms", 374 .procname = "numa_balancing_scan_period_max_ms",
382 .data = &sysctl_numa_balancing_scan_period_max, 375 .data = &sysctl_numa_balancing_scan_period_max,
383 .maxlen = sizeof(unsigned int), 376 .maxlen = sizeof(unsigned int),
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
391 .mode = 0644, 384 .mode = 0644,
392 .proc_handler = proc_dointvec, 385 .proc_handler = proc_dointvec,
393 }, 386 },
387 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int),
398 .mode = 0644,
399 .proc_handler = proc_dointvec,
400 },
394#endif /* CONFIG_NUMA_BALANCING */ 401#endif /* CONFIG_NUMA_BALANCING */
395#endif /* CONFIG_SCHED_DEBUG */ 402#endif /* CONFIG_SCHED_DEBUG */
396 { 403 {
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13db3d1..6582b82fa966 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1093 unsigned long data) 1093 unsigned long data)
1094{ 1094{
1095 int preempt_count = preempt_count(); 1095 int count = preempt_count();
1096 1096
1097#ifdef CONFIG_LOCKDEP 1097#ifdef CONFIG_LOCKDEP
1098 /* 1098 /*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1119 1119
1120 lock_map_release(&lockdep_map); 1120 lock_map_release(&lockdep_map);
1121 1121
1122 if (preempt_count != preempt_count()) { 1122 if (count != preempt_count()) {
1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1124 fn, preempt_count, preempt_count()); 1124 fn, count, preempt_count());
1125 /* 1125 /*
1126 * Restore the preempt count. That gives us a decent 1126 * Restore the preempt count. That gives us a decent
1127 * chance to survive and extract information. If the 1127 * chance to survive and extract information. If the
1128 * callback kept a lock held, bad luck, but not worse 1128 * callback kept a lock held, bad luck, but not worse
1129 * than the BUG() we had. 1129 * than the BUG() we had.
1130 */ 1130 */
1131 preempt_count() = preempt_count; 1131 preempt_count_set(count);
1132 } 1132 }
1133} 1133}
1134 1134
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 6dc09d8f4c24..872a15a2a637 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -1002,7 +1002,7 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
1002 * Some tests (e.g. double-unlock) might corrupt the preemption 1002 * Some tests (e.g. double-unlock) might corrupt the preemption
1003 * count, so restore it: 1003 * count, so restore it:
1004 */ 1004 */
1005 preempt_count() = saved_preempt_count; 1005 preempt_count_set(saved_preempt_count);
1006#ifdef CONFIG_TRACE_IRQFLAGS 1006#ifdef CONFIG_TRACE_IRQFLAGS
1007 if (softirq_count()) 1007 if (softirq_count())
1008 current->softirqs_enabled = 0; 1008 current->softirqs_enabled = 0;
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 4c0d0e51d49e..04abe53f12a1 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -9,10 +9,9 @@
9 9
10notrace unsigned int debug_smp_processor_id(void) 10notrace unsigned int debug_smp_processor_id(void)
11{ 11{
12 unsigned long preempt_count = preempt_count();
13 int this_cpu = raw_smp_processor_id(); 12 int this_cpu = raw_smp_processor_id();
14 13
15 if (likely(preempt_count)) 14 if (likely(preempt_count()))
16 goto out; 15 goto out;
17 16
18 if (irqs_disabled()) 17 if (irqs_disabled())
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7489884682d8..1be2a1f95b61 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1278,64 +1278,105 @@ out:
1278int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 1278int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1279 unsigned long addr, pmd_t pmd, pmd_t *pmdp) 1279 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1280{ 1280{
1281 struct anon_vma *anon_vma = NULL;
1281 struct page *page; 1282 struct page *page;
1282 unsigned long haddr = addr & HPAGE_PMD_MASK; 1283 unsigned long haddr = addr & HPAGE_PMD_MASK;
1283 int target_nid; 1284 int page_nid = -1, this_nid = numa_node_id();
1284 int current_nid = -1; 1285 int target_nid, last_cpupid = -1;
1285 bool migrated; 1286 bool page_locked;
1287 bool migrated = false;
1288 int flags = 0;
1286 1289
1287 spin_lock(&mm->page_table_lock); 1290 spin_lock(&mm->page_table_lock);
1288 if (unlikely(!pmd_same(pmd, *pmdp))) 1291 if (unlikely(!pmd_same(pmd, *pmdp)))
1289 goto out_unlock; 1292 goto out_unlock;
1290 1293
1291 page = pmd_page(pmd); 1294 page = pmd_page(pmd);
1292 get_page(page); 1295 BUG_ON(is_huge_zero_page(page));
1293 current_nid = page_to_nid(page); 1296 page_nid = page_to_nid(page);
1297 last_cpupid = page_cpupid_last(page);
1294 count_vm_numa_event(NUMA_HINT_FAULTS); 1298 count_vm_numa_event(NUMA_HINT_FAULTS);
1295 if (current_nid == numa_node_id()) 1299 if (page_nid == this_nid) {
1296 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 1300 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1301 flags |= TNF_FAULT_LOCAL;
1302 }
1303
1304 /*
1305 * Avoid grouping on DSO/COW pages in specific and RO pages
1306 * in general, RO pages shouldn't hurt as much anyway since
1307 * they can be in shared cache state.
1308 */
1309 if (!pmd_write(pmd))
1310 flags |= TNF_NO_GROUP;
1297 1311
1312 /*
1313 * Acquire the page lock to serialise THP migrations but avoid dropping
1314 * page_table_lock if at all possible
1315 */
1316 page_locked = trylock_page(page);
1298 target_nid = mpol_misplaced(page, vma, haddr); 1317 target_nid = mpol_misplaced(page, vma, haddr);
1299 if (target_nid == -1) { 1318 if (target_nid == -1) {
1300 put_page(page); 1319 /* If the page was locked, there are no parallel migrations */
1301 goto clear_pmdnuma; 1320 if (page_locked)
1321 goto clear_pmdnuma;
1322
1323 /*
1324 * Otherwise wait for potential migrations and retry. We do
1325 * relock and check_same as the page may no longer be mapped.
1326 * As the fault is being retried, do not account for it.
1327 */
1328 spin_unlock(&mm->page_table_lock);
1329 wait_on_page_locked(page);
1330 page_nid = -1;
1331 goto out;
1302 } 1332 }
1303 1333
1304 /* Acquire the page lock to serialise THP migrations */ 1334 /* Page is misplaced, serialise migrations and parallel THP splits */
1335 get_page(page);
1305 spin_unlock(&mm->page_table_lock); 1336 spin_unlock(&mm->page_table_lock);
1306 lock_page(page); 1337 if (!page_locked)
1338 lock_page(page);
1339 anon_vma = page_lock_anon_vma_read(page);
1307 1340
1308 /* Confirm the PTE did not while locked */ 1341 /* Confirm the PMD did not change while page_table_lock was released */
1309 spin_lock(&mm->page_table_lock); 1342 spin_lock(&mm->page_table_lock);
1310 if (unlikely(!pmd_same(pmd, *pmdp))) { 1343 if (unlikely(!pmd_same(pmd, *pmdp))) {
1311 unlock_page(page); 1344 unlock_page(page);
1312 put_page(page); 1345 put_page(page);
1346 page_nid = -1;
1313 goto out_unlock; 1347 goto out_unlock;
1314 } 1348 }
1315 spin_unlock(&mm->page_table_lock);
1316 1349
1317 /* Migrate the THP to the requested node */ 1350 /*
1351 * Migrate the THP to the requested node, returns with page unlocked
1352 * and pmd_numa cleared.
1353 */
1354 spin_unlock(&mm->page_table_lock);
1318 migrated = migrate_misplaced_transhuge_page(mm, vma, 1355 migrated = migrate_misplaced_transhuge_page(mm, vma,
1319 pmdp, pmd, addr, page, target_nid); 1356 pmdp, pmd, addr, page, target_nid);
1320 if (!migrated) 1357 if (migrated) {
1321 goto check_same; 1358 flags |= TNF_MIGRATED;
1322 1359 page_nid = target_nid;
1323 task_numa_fault(target_nid, HPAGE_PMD_NR, true); 1360 }
1324 return 0;
1325 1361
1326check_same: 1362 goto out;
1327 spin_lock(&mm->page_table_lock);
1328 if (unlikely(!pmd_same(pmd, *pmdp)))
1329 goto out_unlock;
1330clear_pmdnuma: 1363clear_pmdnuma:
1364 BUG_ON(!PageLocked(page));
1331 pmd = pmd_mknonnuma(pmd); 1365 pmd = pmd_mknonnuma(pmd);
1332 set_pmd_at(mm, haddr, pmdp, pmd); 1366 set_pmd_at(mm, haddr, pmdp, pmd);
1333 VM_BUG_ON(pmd_numa(*pmdp)); 1367 VM_BUG_ON(pmd_numa(*pmdp));
1334 update_mmu_cache_pmd(vma, addr, pmdp); 1368 update_mmu_cache_pmd(vma, addr, pmdp);
1369 unlock_page(page);
1335out_unlock: 1370out_unlock:
1336 spin_unlock(&mm->page_table_lock); 1371 spin_unlock(&mm->page_table_lock);
1337 if (current_nid != -1) 1372
1338 task_numa_fault(current_nid, HPAGE_PMD_NR, false); 1373out:
1374 if (anon_vma)
1375 page_unlock_anon_vma_read(anon_vma);
1376
1377 if (page_nid != -1)
1378 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
1379
1339 return 0; 1380 return 0;
1340} 1381}
1341 1382
@@ -1432,6 +1473,12 @@ out:
1432 return ret; 1473 return ret;
1433} 1474}
1434 1475
1476/*
1477 * Returns
1478 * - 0 if PMD could not be locked
 1479 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 1480 * - HPAGE_PMD_NR if protections changed and TLB flush necessary
1481 */
1435int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1482int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1436 unsigned long addr, pgprot_t newprot, int prot_numa) 1483 unsigned long addr, pgprot_t newprot, int prot_numa)
1437{ 1484{
@@ -1440,22 +1487,34 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1440 1487
1441 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1488 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1442 pmd_t entry; 1489 pmd_t entry;
1443 entry = pmdp_get_and_clear(mm, addr, pmd); 1490 ret = 1;
1444 if (!prot_numa) { 1491 if (!prot_numa) {
1492 entry = pmdp_get_and_clear(mm, addr, pmd);
1445 entry = pmd_modify(entry, newprot); 1493 entry = pmd_modify(entry, newprot);
1494 ret = HPAGE_PMD_NR;
1446 BUG_ON(pmd_write(entry)); 1495 BUG_ON(pmd_write(entry));
1447 } else { 1496 } else {
1448 struct page *page = pmd_page(*pmd); 1497 struct page *page = pmd_page(*pmd);
1449 1498
1450 /* only check non-shared pages */ 1499 /*
1451 if (page_mapcount(page) == 1 && 1500 * Do not trap faults against the zero page. The
1501 * read-only data is likely to be read-cached on the
1502 * local CPU cache and it is less useful to know about
1503 * local vs remote hits on the zero page.
1504 */
1505 if (!is_huge_zero_page(page) &&
1452 !pmd_numa(*pmd)) { 1506 !pmd_numa(*pmd)) {
1507 entry = pmdp_get_and_clear(mm, addr, pmd);
1453 entry = pmd_mknuma(entry); 1508 entry = pmd_mknuma(entry);
1509 ret = HPAGE_PMD_NR;
1454 } 1510 }
1455 } 1511 }
1456 set_pmd_at(mm, addr, pmd, entry); 1512
1513 /* Set PMD if cleared earlier */
1514 if (ret == HPAGE_PMD_NR)
1515 set_pmd_at(mm, addr, pmd, entry);
1516
1457 spin_unlock(&vma->vm_mm->page_table_lock); 1517 spin_unlock(&vma->vm_mm->page_table_lock);
1458 ret = 1;
1459 } 1518 }
1460 1519
1461 return ret; 1520 return ret;
@@ -1636,7 +1695,7 @@ static void __split_huge_page_refcount(struct page *page,
1636 page_tail->mapping = page->mapping; 1695 page_tail->mapping = page->mapping;
1637 1696
1638 page_tail->index = page->index + i; 1697 page_tail->index = page->index + i;
1639 page_nid_xchg_last(page_tail, page_nid_last(page)); 1698 page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
1640 1699
1641 BUG_ON(!PageAnon(page_tail)); 1700 BUG_ON(!PageAnon(page_tail));
1642 BUG_ON(!PageUptodate(page_tail)); 1701 BUG_ON(!PageUptodate(page_tail));
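In the do_huge_pmd_numa_page() rework above, the fault path first tries the page lock without sleeping; if a parallel THP migration already holds it, the fault drops its own locks, waits for the migration to finish and lets the whole fault be retried rather than queueing behind the migration. A small pthread sketch of that trylock-or-wait-and-retry shape, with stand-in locks and return codes (single-threaded here, so the trylock always succeeds):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 0 when the fault was handled, -EAGAIN when it must be retried. */
static int handle_fault(void)
{
	if (pthread_mutex_trylock(&page_lock) != 0) {
		/*
		 * A parallel "migration" holds the lock: wait for it to be
		 * released, then ask the caller to retry the whole fault
		 * instead of piling up behind the migration.
		 */
		pthread_mutex_lock(&page_lock);
		pthread_mutex_unlock(&page_lock);
		return -EAGAIN;
	}

	/* Lock held: safe to inspect and "migrate" the page. */
	puts("fault handled with the page locked");
	pthread_mutex_unlock(&page_lock);
	return 0;
}

int main(void)
{
	while (handle_fault() == -EAGAIN)
		;	/* retry, as the page-fault path would */
	return 0;
}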
diff --git a/mm/memory.c b/mm/memory.c
index ca0003947115..1c7501f7fb1a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
69 69
70#include "internal.h" 70#include "internal.h"
71 71
72#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 72#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. 73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
74#endif 74#endif
75 75
76#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -2719,6 +2719,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2719 get_page(dirty_page); 2719 get_page(dirty_page);
2720 2720
2721reuse: 2721reuse:
2722 /*
2723 * Clear the pages cpupid information as the existing
2724 * information potentially belongs to a now completely
2725 * unrelated process.
2726 */
2727 if (old_page)
2728 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
2729
2722 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2730 flush_cache_page(vma, address, pte_pfn(orig_pte));
2723 entry = pte_mkyoung(orig_pte); 2731 entry = pte_mkyoung(orig_pte);
2724 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2732 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3519,13 +3527,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3519} 3527}
3520 3528
3521int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3529int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3522 unsigned long addr, int current_nid) 3530 unsigned long addr, int page_nid,
3531 int *flags)
3523{ 3532{
3524 get_page(page); 3533 get_page(page);
3525 3534
3526 count_vm_numa_event(NUMA_HINT_FAULTS); 3535 count_vm_numa_event(NUMA_HINT_FAULTS);
3527 if (current_nid == numa_node_id()) 3536 if (page_nid == numa_node_id()) {
3528 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 3537 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3538 *flags |= TNF_FAULT_LOCAL;
3539 }
3529 3540
3530 return mpol_misplaced(page, vma, addr); 3541 return mpol_misplaced(page, vma, addr);
3531} 3542}
@@ -3535,9 +3546,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3535{ 3546{
3536 struct page *page = NULL; 3547 struct page *page = NULL;
3537 spinlock_t *ptl; 3548 spinlock_t *ptl;
3538 int current_nid = -1; 3549 int page_nid = -1;
3550 int last_cpupid;
3539 int target_nid; 3551 int target_nid;
3540 bool migrated = false; 3552 bool migrated = false;
3553 int flags = 0;
3541 3554
3542 /* 3555 /*
3543 * The "pte" at this point cannot be used safely without 3556 * The "pte" at this point cannot be used safely without
@@ -3564,123 +3577,44 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3564 pte_unmap_unlock(ptep, ptl); 3577 pte_unmap_unlock(ptep, ptl);
3565 return 0; 3578 return 0;
3566 } 3579 }
3580 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3581
3582 /*
3583 * Avoid grouping on DSO/COW pages in specific and RO pages
3584 * in general, RO pages shouldn't hurt as much anyway since
3585 * they can be in shared cache state.
3586 */
3587 if (!pte_write(pte))
3588 flags |= TNF_NO_GROUP;
3589
3590 /*
3591 * Flag if the page is shared between multiple address spaces. This
3592 * is later used when determining whether to group tasks together
3593 */
3594 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3595 flags |= TNF_SHARED;
3567 3596
3568 current_nid = page_to_nid(page); 3597 last_cpupid = page_cpupid_last(page);
3569 target_nid = numa_migrate_prep(page, vma, addr, current_nid); 3598 page_nid = page_to_nid(page);
3599 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
3570 pte_unmap_unlock(ptep, ptl); 3600 pte_unmap_unlock(ptep, ptl);
3571 if (target_nid == -1) { 3601 if (target_nid == -1) {
3572 /*
3573 * Account for the fault against the current node if it not
3574 * being replaced regardless of where the page is located.
3575 */
3576 current_nid = numa_node_id();
3577 put_page(page); 3602 put_page(page);
3578 goto out; 3603 goto out;
3579 } 3604 }
3580 3605
3581 /* Migrate to the requested node */ 3606 /* Migrate to the requested node */
3582 migrated = migrate_misplaced_page(page, target_nid); 3607 migrated = migrate_misplaced_page(page, vma, target_nid);
3583 if (migrated) 3608 if (migrated) {
3584 current_nid = target_nid; 3609 page_nid = target_nid;
3585 3610 flags |= TNF_MIGRATED;
3586out:
3587 if (current_nid != -1)
3588 task_numa_fault(current_nid, 1, migrated);
3589 return 0;
3590}
3591
3592/* NUMA hinting page fault entry point for regular pmds */
3593#ifdef CONFIG_NUMA_BALANCING
3594static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3595 unsigned long addr, pmd_t *pmdp)
3596{
3597 pmd_t pmd;
3598 pte_t *pte, *orig_pte;
3599 unsigned long _addr = addr & PMD_MASK;
3600 unsigned long offset;
3601 spinlock_t *ptl;
3602 bool numa = false;
3603 int local_nid = numa_node_id();
3604
3605 spin_lock(&mm->page_table_lock);
3606 pmd = *pmdp;
3607 if (pmd_numa(pmd)) {
3608 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3609 numa = true;
3610 }
3611 spin_unlock(&mm->page_table_lock);
3612
3613 if (!numa)
3614 return 0;
3615
3616 /* we're in a page fault so some vma must be in the range */
3617 BUG_ON(!vma);
3618 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3619 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3620 VM_BUG_ON(offset >= PMD_SIZE);
3621 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3622 pte += offset >> PAGE_SHIFT;
3623 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3624 pte_t pteval = *pte;
3625 struct page *page;
3626 int curr_nid = local_nid;
3627 int target_nid;
3628 bool migrated;
3629 if (!pte_present(pteval))
3630 continue;
3631 if (!pte_numa(pteval))
3632 continue;
3633 if (addr >= vma->vm_end) {
3634 vma = find_vma(mm, addr);
3635 /* there's a pte present so there must be a vma */
3636 BUG_ON(!vma);
3637 BUG_ON(addr < vma->vm_start);
3638 }
3639 if (pte_numa(pteval)) {
3640 pteval = pte_mknonnuma(pteval);
3641 set_pte_at(mm, addr, pte, pteval);
3642 }
3643 page = vm_normal_page(vma, addr, pteval);
3644 if (unlikely(!page))
3645 continue;
3646 /* only check non-shared pages */
3647 if (unlikely(page_mapcount(page) != 1))
3648 continue;
3649
3650 /*
3651 * Note that the NUMA fault is later accounted to either
3652 * the node that is currently running or where the page is
3653 * migrated to.
3654 */
3655 curr_nid = local_nid;
3656 target_nid = numa_migrate_prep(page, vma, addr,
3657 page_to_nid(page));
3658 if (target_nid == -1) {
3659 put_page(page);
3660 continue;
3661 }
3662
3663 /* Migrate to the requested node */
3664 pte_unmap_unlock(pte, ptl);
3665 migrated = migrate_misplaced_page(page, target_nid);
3666 if (migrated)
3667 curr_nid = target_nid;
3668 task_numa_fault(curr_nid, 1, migrated);
3669
3670 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3671 } 3611 }
3672 pte_unmap_unlock(orig_pte, ptl);
3673 3612
3613out:
3614 if (page_nid != -1)
3615 task_numa_fault(last_cpupid, page_nid, 1, flags);
3674 return 0; 3616 return 0;
3675} 3617}
3676#else
3677static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3678 unsigned long addr, pmd_t *pmdp)
3679{
3680 BUG();
3681 return 0;
3682}
3683#endif /* CONFIG_NUMA_BALANCING */
3684 3618
3685/* 3619/*
3686 * These routines also need to handle stuff like marking pages dirty 3620 * These routines also need to handle stuff like marking pages dirty
@@ -3820,8 +3754,8 @@ retry:
3820 } 3754 }
3821 } 3755 }
3822 3756
3823 if (pmd_numa(*pmd)) 3757 /* THP should already have been handled */
3824 return do_pmd_numa_page(mm, vma, address, pmd); 3758 BUG_ON(pmd_numa(*pmd));
3825 3759
3826 /* 3760 /*
3827 * Use __pte_alloc instead of pte_alloc_map, because we can't 3761 * Use __pte_alloc instead of pte_alloc_map, because we can't
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04729647f359..71cb253368cb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1679 return pol; 1679 return pol;
1680} 1680}
1681 1681
1682bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1683{
1684 struct mempolicy *pol = get_task_policy(task);
1685 if (vma) {
1686 if (vma->vm_ops && vma->vm_ops->get_policy) {
1687 bool ret = false;
1688
1689 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1690 if (pol && (pol->flags & MPOL_F_MOF))
1691 ret = true;
1692 mpol_cond_put(pol);
1693
1694 return ret;
1695 } else if (vma->vm_policy) {
1696 pol = vma->vm_policy;
1697 }
1698 }
1699
1700 if (!pol)
1701 return default_policy.flags & MPOL_F_MOF;
1702
1703 return pol->flags & MPOL_F_MOF;
1704}
1705
1682static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 1706static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1683{ 1707{
1684 enum zone_type dynamic_policy_zone = policy_zone; 1708 enum zone_type dynamic_policy_zone = policy_zone;
@@ -2277,6 +2301,35 @@ static void sp_free(struct sp_node *n)
2277 kmem_cache_free(sn_cache, n); 2301 kmem_cache_free(sn_cache, n);
2278} 2302}
2279 2303
2304#ifdef CONFIG_NUMA_BALANCING
2305static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2306{
2307 /* Never defer a private fault */
2308 if (cpupid_match_pid(p, last_cpupid))
2309 return false;
2310
2311 if (p->numa_migrate_deferred) {
2312 p->numa_migrate_deferred--;
2313 return true;
2314 }
2315 return false;
2316}
2317
2318static inline void defer_numa_migrate(struct task_struct *p)
2319{
2320 p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
2321}
2322#else
2323static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2324{
2325 return false;
2326}
2327
2328static inline void defer_numa_migrate(struct task_struct *p)
2329{
2330}
2331#endif /* CONFIG_NUMA_BALANCING */
2332
2280/** 2333/**
2281 * mpol_misplaced - check whether current page node is valid in policy 2334 * mpol_misplaced - check whether current page node is valid in policy
2282 * 2335 *
@@ -2300,6 +2353,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2300 struct zone *zone; 2353 struct zone *zone;
2301 int curnid = page_to_nid(page); 2354 int curnid = page_to_nid(page);
2302 unsigned long pgoff; 2355 unsigned long pgoff;
2356 int thiscpu = raw_smp_processor_id();
2357 int thisnid = cpu_to_node(thiscpu);
2303 int polnid = -1; 2358 int polnid = -1;
2304 int ret = -1; 2359 int ret = -1;
2305 2360
@@ -2348,9 +2403,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2348 2403
2349 /* Migrate the page towards the node whose CPU is referencing it */ 2404 /* Migrate the page towards the node whose CPU is referencing it */
2350 if (pol->flags & MPOL_F_MORON) { 2405 if (pol->flags & MPOL_F_MORON) {
2351 int last_nid; 2406 int last_cpupid;
2407 int this_cpupid;
2352 2408
2353 polnid = numa_node_id(); 2409 polnid = thisnid;
2410 this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
2354 2411
2355 /* 2412 /*
2356 * Multi-stage node selection is used in conjunction 2413 * Multi-stage node selection is used in conjunction
@@ -2373,8 +2430,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2373 * it less likely we act on an unlikely task<->page 2430 * it less likely we act on an unlikely task<->page
2374 * relation. 2431 * relation.
2375 */ 2432 */
2376 last_nid = page_nid_xchg_last(page, polnid); 2433 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
2377 if (last_nid != polnid) 2434 if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
2435
2436 /* See sysctl_numa_balancing_migrate_deferred comment */
2437 if (!cpupid_match_pid(current, last_cpupid))
2438 defer_numa_migrate(current);
2439
2440 goto out;
2441 }
2442
2443 /*
2444 * The quadratic filter above reduces extraneous migration
2445 * of shared pages somewhat. This code reduces it even more,
2446 * reducing the overhead of page migrations of shared pages.
2447 * This makes workloads with shared pages rely more on
2448 * "move task near its memory", and less on "move memory
2449 * towards its task", which is exactly what we want.
2450 */
2451 if (numa_migrate_deferred(current, last_cpupid))
2378 goto out; 2452 goto out;
2379 } 2453 }
2380 2454
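The mempolicy.c (and related) hunks above replace the per-page last-NID with a packed last "cpupid": the CPU and a truncated PID of the last faulting task are squeezed into one integer stored in the page flags, so a later fault can tell whether the same task or the same node touched the page before. A toy packing sketch with made-up field widths, mirroring the spirit of cpu_pid_to_cpupid()/cpupid_to_nid() but not their exact layout:

#include <stdio.h>

#define PID_BITS	8
#define CPU_BITS	8
#define PID_MASK	((1 << PID_BITS) - 1)
#define CPU_MASK	((1 << CPU_BITS) - 1)

/* Pack a cpu and the low bits of a pid into one "cpupid" value. */
static int pack_cpupid(int cpu, int pid)
{
	return ((cpu & CPU_MASK) << PID_BITS) | (pid & PID_MASK);
}

static int cpupid_cpu(int cpupid) { return (cpupid >> PID_BITS) & CPU_MASK; }
static int cpupid_pid(int cpupid) { return cpupid & PID_MASK; }

int main(void)
{
	int cpupid = pack_cpupid(3, 12345);

	/* The pid is truncated, so comparisons must truncate both sides. */
	printf("cpu %d, pid low bits %d, pid match: %d\n",
	       cpupid_cpu(cpupid), cpupid_pid(cpupid),
	       cpupid_pid(cpupid) == (12345 & PID_MASK));
	return 0;
}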
diff --git a/mm/migrate.c b/mm/migrate.c
index a26bccd44ccb..44c1fa9d6f54 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -443,6 +443,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
443 */ 443 */
444void migrate_page_copy(struct page *newpage, struct page *page) 444void migrate_page_copy(struct page *newpage, struct page *page)
445{ 445{
446 int cpupid;
447
446 if (PageHuge(page) || PageTransHuge(page)) 448 if (PageHuge(page) || PageTransHuge(page))
447 copy_huge_page(newpage, page); 449 copy_huge_page(newpage, page);
448 else 450 else
@@ -479,6 +481,13 @@ void migrate_page_copy(struct page *newpage, struct page *page)
479 __set_page_dirty_nobuffers(newpage); 481 __set_page_dirty_nobuffers(newpage);
480 } 482 }
481 483
484 /*
485 * Copy NUMA information to the new page, to prevent over-eager
486 * future migrations of this same page.
487 */
488 cpupid = page_cpupid_xchg_last(page, -1);
489 page_cpupid_xchg_last(newpage, cpupid);
490
482 mlock_migrate_page(newpage, page); 491 mlock_migrate_page(newpage, page);
483 ksm_migrate_page(newpage, page); 492 ksm_migrate_page(newpage, page);
484 /* 493 /*
@@ -1498,7 +1507,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1498 __GFP_NOWARN) & 1507 __GFP_NOWARN) &
1499 ~GFP_IOFS, 0); 1508 ~GFP_IOFS, 0);
1500 if (newpage) 1509 if (newpage)
1501 page_nid_xchg_last(newpage, page_nid_last(page)); 1510 page_cpupid_xchg_last(newpage, page_cpupid_last(page));
1502 1511
1503 return newpage; 1512 return newpage;
1504} 1513}
@@ -1599,7 +1608,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1599 * node. Caller is expected to have an elevated reference count on 1608 * node. Caller is expected to have an elevated reference count on
1600 * the page that will be dropped by this function before returning. 1609 * the page that will be dropped by this function before returning.
1601 */ 1610 */
1602int migrate_misplaced_page(struct page *page, int node) 1611int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1612 int node)
1603{ 1613{
1604 pg_data_t *pgdat = NODE_DATA(node); 1614 pg_data_t *pgdat = NODE_DATA(node);
1605 int isolated; 1615 int isolated;
@@ -1607,10 +1617,11 @@ int migrate_misplaced_page(struct page *page, int node)
1607 LIST_HEAD(migratepages); 1617 LIST_HEAD(migratepages);
1608 1618
1609 /* 1619 /*
1610 * Don't migrate pages that are mapped in multiple processes. 1620 * Don't migrate file pages that are mapped in multiple processes
1611 * TODO: Handle false sharing detection instead of this hammer 1621 * with execute permissions as they are probably shared libraries.
1612 */ 1622 */
1613 if (page_mapcount(page) != 1) 1623 if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1624 (vma->vm_flags & VM_EXEC))
1614 goto out; 1625 goto out;
1615 1626
1616 /* 1627 /*
@@ -1661,13 +1672,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1661 int page_lru = page_is_file_cache(page); 1672 int page_lru = page_is_file_cache(page);
1662 1673
1663 /* 1674 /*
1664 * Don't migrate pages that are mapped in multiple processes.
1665 * TODO: Handle false sharing detection instead of this hammer
1666 */
1667 if (page_mapcount(page) != 1)
1668 goto out_dropref;
1669
1670 /*
1671 * Rate-limit the amount of data that is being migrated to a node. 1675 * Rate-limit the amount of data that is being migrated to a node.
1672 * Optimal placement is no good if the memory bus is saturated and 1676 * Optimal placement is no good if the memory bus is saturated and
1673 * all the time is being spent migrating! 1677 * all the time is being spent migrating!
@@ -1680,7 +1684,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1680 if (!new_page) 1684 if (!new_page)
1681 goto out_fail; 1685 goto out_fail;
1682 1686
1683 page_nid_xchg_last(new_page, page_nid_last(page)); 1687 page_cpupid_xchg_last(new_page, page_cpupid_last(page));
1684 1688
1685 isolated = numamigrate_isolate_page(pgdat, page); 1689 isolated = numamigrate_isolate_page(pgdat, page);
1686 if (!isolated) { 1690 if (!isolated) {
@@ -1713,12 +1717,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1713 unlock_page(new_page); 1717 unlock_page(new_page);
1714 put_page(new_page); /* Free it */ 1718 put_page(new_page); /* Free it */
1715 1719
1716 unlock_page(page); 1720 /* Retake the callers reference and putback on LRU */
1721 get_page(page);
1717 putback_lru_page(page); 1722 putback_lru_page(page);
1718 1723 mod_zone_page_state(page_zone(page),
1719 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1724 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
1720 isolated = 0; 1725 goto out_fail;
1721 goto out;
1722 } 1726 }
1723 1727
1724 /* 1728 /*
@@ -1735,9 +1739,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1735 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1739 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1736 entry = pmd_mkhuge(entry); 1740 entry = pmd_mkhuge(entry);
1737 1741
1738 page_add_new_anon_rmap(new_page, vma, haddr); 1742 pmdp_clear_flush(vma, haddr, pmd);
1739
1740 set_pmd_at(mm, haddr, pmd, entry); 1743 set_pmd_at(mm, haddr, pmd, entry);
1744 page_add_new_anon_rmap(new_page, vma, haddr);
1741 update_mmu_cache_pmd(vma, address, &entry); 1745 update_mmu_cache_pmd(vma, address, &entry);
1742 page_remove_rmap(page); 1746 page_remove_rmap(page);
1743 /* 1747 /*
@@ -1756,7 +1760,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1756 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); 1760 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
1757 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); 1761 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
1758 1762
1759out:
1760 mod_zone_page_state(page_zone(page), 1763 mod_zone_page_state(page_zone(page),
1761 NR_ISOLATED_ANON + page_lru, 1764 NR_ISOLATED_ANON + page_lru,
1762 -HPAGE_PMD_NR); 1765 -HPAGE_PMD_NR);
@@ -1765,6 +1768,10 @@ out:
1765out_fail: 1768out_fail:
1766 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1769 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1767out_dropref: 1770out_dropref:
1771 entry = pmd_mknonnuma(entry);
1772 set_pmd_at(mm, haddr, pmd, entry);
1773 update_mmu_cache_pmd(vma, address, &entry);
1774
1768 unlock_page(page); 1775 unlock_page(page);
1769 put_page(page); 1776 put_page(page);
1770 return 0; 1777 return 0;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 633c08863fd8..68562e92d50c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void)
 	unsigned long or_mask, add_mask;
 
 	shift = 8 * sizeof(unsigned long);
-	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
+	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-		"Section %d Node %d Zone %d Lastnid %d Flags %d\n",
+		"Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
 		SECTIONS_WIDTH,
 		NODES_WIDTH,
 		ZONES_WIDTH,
-		LAST_NID_WIDTH,
+		LAST_CPUPID_WIDTH,
 		NR_PAGEFLAGS);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
-		"Section %d Node %d Zone %d Lastnid %d\n",
+		"Section %d Node %d Zone %d Lastcpupid %d\n",
 		SECTIONS_SHIFT,
 		NODES_SHIFT,
 		ZONES_SHIFT,
-		LAST_NID_SHIFT);
+		LAST_CPUPID_SHIFT);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
-		"Section %lu Node %lu Zone %lu Lastnid %lu\n",
+		"Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
 		(unsigned long)SECTIONS_PGSHIFT,
 		(unsigned long)NODES_PGSHIFT,
 		(unsigned long)ZONES_PGSHIFT,
-		(unsigned long)LAST_NID_PGSHIFT);
+		(unsigned long)LAST_CPUPID_PGSHIFT);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
 		"Node/Zone ID: %lu -> %lu\n",
 		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void)
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
 		"Node not in page flags");
 #endif
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
-		"Last nid not in page flags");
+		"Last cpupid not in page flags");
 #endif
 
 	if (SECTIONS_WIDTH) {
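The LAST_NID fields become LAST_CPUPID because the spare page-flag bits now record the last CPU plus the low bits of the last task pid to fault on the page, not just the node. A rough user-space sketch of that packing, with illustrative field widths (the kernel's real LAST__PID_SHIFT / NR_CPUS_BITS layout may differ):

/* Illustrative sketch of the cpupid idea; widths are assumptions. */
#include <stdio.h>

#define PID_BITS	8
#define PID_MASK	((1 << PID_BITS) - 1)
#define CPU_BITS	6			/* assumed number of CPU bits */
#define CPU_MASK	((1 << CPU_BITS) - 1)

static int cpu_pid_to_cpupid(int cpu, int pid)
{
	return ((cpu & CPU_MASK) << PID_BITS) | (pid & PID_MASK);
}

static int cpupid_to_cpu(int cpupid) { return (cpupid >> PID_BITS) & CPU_MASK; }
static int cpupid_to_pid(int cpupid) { return cpupid & PID_MASK; }

int main(void)
{
	int cpupid = cpu_pid_to_cpupid(3, 4242);

	/*
	 * Only the low bits of the pid survive; that is still enough to
	 * distinguish "the same task came back" from "a different task
	 * touched this page".
	 */
	printf("cpupid=%#x cpu=%d pid(low bits)=%d\n",
	       cpupid, cpupid_to_cpu(cpupid), cpupid_to_pid(cpupid));
	return 0;
}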
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 2ac0afbd68f3..bf34fb8556db 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
 }
 
-#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
-int page_nid_xchg_last(struct page *page, int nid)
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
+int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
 	unsigned long old_flags, flags;
-	int last_nid;
+	int last_cpupid;
 
 	do {
 		old_flags = flags = page->flags;
-		last_nid = page_nid_last(page);
+		last_cpupid = page_cpupid_last(page);
 
-		flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
-		flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+		flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
+		flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
 	} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
 
-	return last_nid;
+	return last_cpupid;
 }
 #endif
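page_cpupid_xchg_last() updates only the cpupid bits of page->flags with a compare-and-swap retry loop, so concurrent updates to other flag bits are never lost. A minimal user-space analogue of that loop using C11 atomics (field width and position are assumptions, not the kernel's layout):

/* Sketch of a lock-free read-modify-write of a few bits in a flags word. */
#include <stdatomic.h>
#include <stdio.h>

#define CPUPID_BITS	14
#define CPUPID_MASK	((1UL << CPUPID_BITS) - 1)
#define CPUPID_SHIFT	16			/* assumed position in the word */

static _Atomic unsigned long page_flags;

static unsigned long cpupid_xchg_last(unsigned long cpupid)
{
	unsigned long old_flags, flags;

	do {
		old_flags = flags = atomic_load(&page_flags);
		flags &= ~(CPUPID_MASK << CPUPID_SHIFT);
		flags |= (cpupid & CPUPID_MASK) << CPUPID_SHIFT;
		/* retry if another thread changed the flags word meanwhile */
	} while (!atomic_compare_exchange_weak(&page_flags, &old_flags, flags));

	return (old_flags >> CPUPID_SHIFT) & CPUPID_MASK;
}

int main(void)
{
	cpupid_xchg_last(0x123);
	printf("previous cpupid: %#lx\n", cpupid_xchg_last(0x456));
	return 0;
}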
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4d6b43..a0302ac0be98 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,14 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa, bool *ret_all_same_node)
+		int dirty_accountable, int prot_numa)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 	unsigned long pages = 0;
-	bool all_same_node = true;
-	int last_nid = -1;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -63,15 +61,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 				page = vm_normal_page(vma, addr, oldpte);
 				if (page) {
-					int this_nid = page_to_nid(page);
-					if (last_nid == -1)
-						last_nid = this_nid;
-					if (last_nid != this_nid)
-						all_same_node = false;
-
-					/* only check non-shared pages */
-					if (!pte_numa(oldpte) &&
-					    page_mapcount(page) == 1) {
+					if (!pte_numa(oldpte)) {
 						ptent = pte_mknuma(ptent);
 						updated = true;
 					}
@@ -101,33 +91,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				make_migration_entry_read(&entry);
 				set_pte_at(mm, addr, pte,
 					swp_entry_to_pte(entry));
+
+				pages++;
 			}
-			pages++;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
-	*ret_all_same_node = all_same_node;
 	return pages;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
-				       pmd_t *pmd)
-{
-	spin_lock(&mm->page_table_lock);
-	set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
-	spin_unlock(&mm->page_table_lock);
-}
-#else
-static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
-				       pmd_t *pmd)
-{
-	BUG();
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		pud_t *pud, unsigned long addr, unsigned long end,
 		pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -135,34 +109,33 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 	pmd_t *pmd;
 	unsigned long next;
 	unsigned long pages = 0;
-	bool all_same_node;
 
 	pmd = pmd_offset(pud, addr);
 	do {
+		unsigned long this_pages;
+
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*pmd)) {
 			if (next - addr != HPAGE_PMD_SIZE)
 				split_huge_page_pmd(vma, addr, pmd);
-			else if (change_huge_pmd(vma, pmd, addr, newprot,
-						 prot_numa)) {
-				pages += HPAGE_PMD_NR;
-				continue;
+			else {
+				int nr_ptes = change_huge_pmd(vma, pmd, addr,
+						newprot, prot_numa);
+
+				if (nr_ptes) {
+					if (nr_ptes == HPAGE_PMD_NR)
+						pages++;
+
+					continue;
+				}
 			}
 			/* fall through */
 		}
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		pages += change_pte_range(vma, pmd, addr, next, newprot,
-				 dirty_accountable, prot_numa, &all_same_node);
-
-		/*
-		 * If we are changing protections for NUMA hinting faults then
-		 * set pmd_numa if the examined pages were all on the same
-		 * node. This allows a regular PMD to be handled as one fault
-		 * and effectively batches the taking of the PTL
-		 */
-		if (prot_numa && all_same_node)
-			change_pmd_protnuma(vma->vm_mm, addr, pmd);
+		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
+				 dirty_accountable, prot_numa);
+		pages += this_pages;
 	} while (pmd++, addr = next, addr != end);
 
 	return pages;
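With change_pmd_protnuma() and the all_same_node batching gone, change_pmd_range() now relies on change_huge_pmd() reporting how many PTE-sized entries it covered: zero means fall through to the per-PTE path, and a fully updated THP is accounted as a single page of hinting-fault work. A stubbed user-space sketch of that control flow (change_huge_pmd_stub and its argument are invented stand-ins):

/* Sketch of the new accounting: count a THP update once, else fall through. */
#include <stdio.h>

#define HPAGE_PMD_NR 512	/* PTEs per huge page on x86-64 */

/* pretend helper: 0 = not handled as huge, HPAGE_PMD_NR = whole THP updated */
static int change_huge_pmd_stub(int handled)
{
	return handled ? HPAGE_PMD_NR : 0;
}

int main(void)
{
	unsigned long pages = 0;
	int nr_ptes = change_huge_pmd_stub(1);

	if (nr_ptes) {
		if (nr_ptes == HPAGE_PMD_NR)
			pages++;	/* whole THP counted as one updated page */
		/* would continue to the next pmd here */
	} else {
		/* would fall through to the per-PTE path, which now returns
		 * its own count instead of an all_same_node flag */
	}
	printf("pages counted: %lu\n", pages);
	return 0;
}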
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd886fac451a..73d812f16dde 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page)
 		bad_page(page);
 		return 1;
 	}
-	page_nid_reset_last(page);
+	page_cpupid_reset_last(page);
 	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
 		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return 0;
@@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		mminit_verify_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		page_mapcount_reset(page);
-		page_nid_reset_last(page);
+		page_cpupid_reset_last(page);
 		SetPageReserved(page);
 		/*
 		 * Mark the block movable so that blocks are reserved for
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 0578d4fa00a9..0f676908d15b 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -2563,9 +2563,8 @@ bed:
 			  jiffies + msecs_to_jiffies(val));
 
 		/* Wait for IR-LMP to call us back */
-		__wait_event_interruptible(self->query_wait,
-			      (self->cachedaddr != 0 || self->errno == -ETIME),
-			      err);
+		err = __wait_event_interruptible(self->query_wait,
+			      (self->cachedaddr != 0 || self->errno == -ETIME));
 
 		/* If watchdog is still activated, kill it! */
 		del_timer(&(self->watchdog));
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index f4484719f3e6..f63c2388f38d 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1637,12 +1637,9 @@ static int sync_thread_master(void *data)
 			continue;
 		}
 		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
-			int ret = 0;
-
-			__wait_event_interruptible(*sk_sleep(sk),
+			int ret = __wait_event_interruptible(*sk_sleep(sk),
 						   sock_writeable(sk) ||
-						   kthread_should_stop(),
-						   ret);
+						   kthread_should_stop());
 			if (unlikely(kthread_should_stop()))
 				goto done;
 		}
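Both network hunks adapt to the reworked wait API: __wait_event_interruptible() now evaluates to its status instead of writing it into a caller-supplied variable, so callers write err = __wait_event_interruptible(wq, cond). A user-space analogue of that interface change using a GNU C statement expression (WAIT_EVENT_OLD, WAIT_EVENT_NEW and do_wait_step are invented for illustration, not the kernel macros):

/* Sketch: out-parameter macro vs. value-returning macro. */
#include <stdio.h>

static int do_wait_step(int cond)
{
	return cond ? 0 : -1;		/* pretend wait result */
}

/* old style: result handed back through an out parameter */
#define WAIT_EVENT_OLD(cond, ret)	do { (ret) = do_wait_step(cond); } while (0)

/* new style: the macro itself has a value (GNU C statement expression) */
#define WAIT_EVENT_NEW(cond)		({ do_wait_step(cond); })

int main(void)
{
	int err;

	WAIT_EVENT_OLD(1, err);
	printf("old style err = %d\n", err);

	err = WAIT_EVENT_NEW(1);
	printf("new style err = %d\n", err);
	return 0;
}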