diff options
107 files changed, 3008 insertions, 1189 deletions
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 9d4c1d18ad44..4273b2d71a27 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
@@ -355,6 +355,82 @@ utilize. | |||
355 | 355 | ||
356 | ============================================================== | 356 | ============================================================== |
357 | 357 | ||
358 | numa_balancing | ||
359 | |||
360 | Enables/disables automatic page fault based NUMA memory | ||
361 | balancing. Memory is moved automatically to nodes | ||
362 | that access it often. | ||
363 | |||
364 | Enables/disables automatic NUMA memory balancing. On NUMA machines, there | ||
365 | is a performance penalty if remote memory is accessed by a CPU. When this | ||
366 | feature is enabled the kernel samples what task thread is accessing memory | ||
367 | by periodically unmapping pages and later trapping a page fault. At the | ||
368 | time of the page fault, it is determined if the data being accessed should | ||
369 | be migrated to a local memory node. | ||
370 | |||
371 | The unmapping of pages and trapping faults incur additional overhead that | ||
372 | ideally is offset by improved memory locality but there is no universal | ||
373 | guarantee. If the target workload is already bound to NUMA nodes then this | ||
374 | feature should be disabled. Otherwise, if the system overhead from the | ||
375 | feature is too high then the rate the kernel samples for NUMA hinting | ||
376 | faults may be controlled by the numa_balancing_scan_period_min_ms, | ||
377 | numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, | ||
378 | numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and | ||
379 | numa_balancing_migrate_deferred. | ||
380 | |||
381 | ============================================================== | ||
382 | |||
383 | numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, | ||
384 | numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb | ||
385 | |||
386 | Automatic NUMA balancing scans tasks address space and unmaps pages to | ||
387 | detect if pages are properly placed or if the data should be migrated to a | ||
388 | memory node local to where the task is running. Every "scan delay" the task | ||
389 | scans the next "scan size" number of pages in its address space. When the | ||
390 | end of the address space is reached the scanner restarts from the beginning. | ||
391 | |||
392 | In combination, the "scan delay" and "scan size" determine the scan rate. | ||
393 | When "scan delay" decreases, the scan rate increases. The scan delay and | ||
394 | hence the scan rate of every task is adaptive and depends on historical | ||
395 | behaviour. If pages are properly placed then the scan delay increases, | ||
396 | otherwise the scan delay decreases. The "scan size" is not adaptive but | ||
397 | the higher the "scan size", the higher the scan rate. | ||
398 | |||
399 | Higher scan rates incur higher system overhead as page faults must be | ||
400 | trapped and potentially data must be migrated. However, the higher the scan | ||
401 | rate, the more quickly a tasks memory is migrated to a local node if the | ||
402 | workload pattern changes and minimises performance impact due to remote | ||
403 | memory accesses. These sysctls control the thresholds for scan delays and | ||
404 | the number of pages scanned. | ||
405 | |||
406 | numa_balancing_scan_period_min_ms is the minimum time in milliseconds to | ||
407 | scan a tasks virtual memory. It effectively controls the maximum scanning | ||
408 | rate for each task. | ||
409 | |||
410 | numa_balancing_scan_delay_ms is the starting "scan delay" used for a task | ||
411 | when it initially forks. | ||
412 | |||
413 | numa_balancing_scan_period_max_ms is the maximum time in milliseconds to | ||
414 | scan a tasks virtual memory. It effectively controls the minimum scanning | ||
415 | rate for each task. | ||
416 | |||
417 | numa_balancing_scan_size_mb is how many megabytes worth of pages are | ||
418 | scanned for a given scan. | ||
419 | |||
420 | numa_balancing_settle_count is how many scan periods must complete before | ||
421 | the schedule balancer stops pushing the task towards a preferred node. This | ||
422 | gives the scheduler a chance to place the task on an alternative node if the | ||
423 | preferred node is overloaded. | ||
424 | |||
425 | numa_balancing_migrate_deferred is how many page migrations get skipped | ||
426 | unconditionally, after a page migration is skipped because a page is shared | ||
427 | with other tasks. This reduces page migration overhead, and determines | ||
428 | how much stronger the "move task near its memory" policy scheduler becomes, | ||
429 | versus the "move memory near its task" memory management policy, for workloads | ||
430 | with shared memory. | ||
431 | |||
432 | ============================================================== | ||
433 | |||
358 | osrelease, ostype & version: | 434 | osrelease, ostype & version: |
359 | 435 | ||
360 | # cat osrelease | 436 | # cat osrelease |
diff --git a/MAINTAINERS b/MAINTAINERS index 8a0cbf3cf2c8..aee6733391cb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -7282,6 +7282,8 @@ S: Maintained | |||
7282 | F: kernel/sched/ | 7282 | F: kernel/sched/ |
7283 | F: include/linux/sched.h | 7283 | F: include/linux/sched.h |
7284 | F: include/uapi/linux/sched.h | 7284 | F: include/uapi/linux/sched.h |
7285 | F: kernel/wait.c | ||
7286 | F: include/linux/wait.h | ||
7285 | 7287 | ||
7286 | SCORE ARCHITECTURE | 7288 | SCORE ARCHITECTURE |
7287 | M: Chen Liqin <liqin.linux@gmail.com> | 7289 | M: Chen Liqin <liqin.linux@gmail.com> |
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index a6e85f448c1c..f01fb505ad52 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild | |||
@@ -3,3 +3,4 @@ generic-y += clkdev.h | |||
3 | 3 | ||
4 | generic-y += exec.h | 4 | generic-y += exec.h |
5 | generic-y += trace_clock.h | 5 | generic-y += trace_clock.h |
6 | generic-y += preempt.h | ||
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index d8dd660898b9..5943f7f9d325 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild | |||
@@ -46,3 +46,4 @@ generic-y += ucontext.h | |||
46 | generic-y += user.h | 46 | generic-y += user.h |
47 | generic-y += vga.h | 47 | generic-y += vga.h |
48 | generic-y += xor.h | 48 | generic-y += xor.h |
49 | generic-y += preempt.h | ||
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index d3db39860b9c..4e6838d4ddf6 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild | |||
@@ -33,3 +33,4 @@ generic-y += timex.h | |||
33 | generic-y += trace_clock.h | 33 | generic-y += trace_clock.h |
34 | generic-y += types.h | 34 | generic-y += types.h |
35 | generic-y += unaligned.h | 35 | generic-y += unaligned.h |
36 | generic-y += preempt.h | ||
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 79a642d199f2..519f89f5b6a3 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild | |||
@@ -50,3 +50,4 @@ generic-y += unaligned.h | |||
50 | generic-y += user.h | 50 | generic-y += user.h |
51 | generic-y += vga.h | 51 | generic-y += vga.h |
52 | generic-y += xor.h | 52 | generic-y += xor.h |
53 | generic-y += preempt.h | ||
diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild index fd7980743890..658001b52400 100644 --- a/arch/avr32/include/asm/Kbuild +++ b/arch/avr32/include/asm/Kbuild | |||
@@ -7,6 +7,7 @@ generic-y += div64.h | |||
7 | generic-y += emergency-restart.h | 7 | generic-y += emergency-restart.h |
8 | generic-y += exec.h | 8 | generic-y += exec.h |
9 | generic-y += futex.h | 9 | generic-y += futex.h |
10 | generic-y += preempt.h | ||
10 | generic-y += irq_regs.h | 11 | generic-y += irq_regs.h |
11 | generic-y += param.h | 12 | generic-y += param.h |
12 | generic-y += local.h | 13 | generic-y += local.h |
diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index 127826f8a375..f2b43474b0e2 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild | |||
@@ -44,3 +44,4 @@ generic-y += ucontext.h | |||
44 | generic-y += unaligned.h | 44 | generic-y += unaligned.h |
45 | generic-y += user.h | 45 | generic-y += user.h |
46 | generic-y += xor.h | 46 | generic-y += xor.h |
47 | generic-y += preempt.h | ||
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index e49f918531ad..fc0b3c356027 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild | |||
@@ -56,3 +56,4 @@ generic-y += ucontext.h | |||
56 | generic-y += user.h | 56 | generic-y += user.h |
57 | generic-y += vga.h | 57 | generic-y += vga.h |
58 | generic-y += xor.h | 58 | generic-y += xor.h |
59 | generic-y += preempt.h | ||
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index c8325455520e..b06caf649a95 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild | |||
@@ -11,3 +11,4 @@ generic-y += module.h | |||
11 | generic-y += trace_clock.h | 11 | generic-y += trace_clock.h |
12 | generic-y += vga.h | 12 | generic-y += vga.h |
13 | generic-y += xor.h | 13 | generic-y += xor.h |
14 | generic-y += preempt.h | ||
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild index c5d767028306..74742dc6a3da 100644 --- a/arch/frv/include/asm/Kbuild +++ b/arch/frv/include/asm/Kbuild | |||
@@ -2,3 +2,4 @@ | |||
2 | generic-y += clkdev.h | 2 | generic-y += clkdev.h |
3 | generic-y += exec.h | 3 | generic-y += exec.h |
4 | generic-y += trace_clock.h | 4 | generic-y += trace_clock.h |
5 | generic-y += preempt.h | ||
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 8ada3cf0c98d..7e0e7213a481 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild | |||
@@ -6,3 +6,4 @@ generic-y += mmu.h | |||
6 | generic-y += module.h | 6 | generic-y += module.h |
7 | generic-y += trace_clock.h | 7 | generic-y += trace_clock.h |
8 | generic-y += xor.h | 8 | generic-y += xor.h |
9 | generic-y += preempt.h | ||
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 1da17caac23c..67c3450309b7 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild | |||
@@ -53,3 +53,4 @@ generic-y += types.h | |||
53 | generic-y += ucontext.h | 53 | generic-y += ucontext.h |
54 | generic-y += unaligned.h | 54 | generic-y += unaligned.h |
55 | generic-y += xor.h | 55 | generic-y += xor.h |
56 | generic-y += preempt.h | ||
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index a3456f34f672..f93ee087e8fe 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild | |||
@@ -3,4 +3,5 @@ generic-y += clkdev.h | |||
3 | generic-y += exec.h | 3 | generic-y += exec.h |
4 | generic-y += kvm_para.h | 4 | generic-y += kvm_para.h |
5 | generic-y += trace_clock.h | 5 | generic-y += trace_clock.h |
6 | generic-y += preempt.h | ||
6 | generic-y += vtime.h \ No newline at end of file | 7 | generic-y += vtime.h \ No newline at end of file |
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index bebdc36ebb0a..2b58c5f0bc38 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild | |||
@@ -3,3 +3,4 @@ generic-y += clkdev.h | |||
3 | generic-y += exec.h | 3 | generic-y += exec.h |
4 | generic-y += module.h | 4 | generic-y += module.h |
5 | generic-y += trace_clock.h | 5 | generic-y += trace_clock.h |
6 | generic-y += preempt.h | ||
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 09d77a862da3..a5d27f272a59 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild | |||
@@ -31,3 +31,4 @@ generic-y += trace_clock.h | |||
31 | generic-y += types.h | 31 | generic-y += types.h |
32 | generic-y += word-at-a-time.h | 32 | generic-y += word-at-a-time.h |
33 | generic-y += xor.h | 33 | generic-y += xor.h |
34 | generic-y += preempt.h | ||
diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild index 6ae0ccb632cb..84d0c1d6b9b3 100644 --- a/arch/metag/include/asm/Kbuild +++ b/arch/metag/include/asm/Kbuild | |||
@@ -52,3 +52,4 @@ generic-y += unaligned.h | |||
52 | generic-y += user.h | 52 | generic-y += user.h |
53 | generic-y += vga.h | 53 | generic-y += vga.h |
54 | generic-y += xor.h | 54 | generic-y += xor.h |
55 | generic-y += preempt.h | ||
diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h index 23f5118f58db..8e9c0b3b9691 100644 --- a/arch/metag/include/asm/topology.h +++ b/arch/metag/include/asm/topology.h | |||
@@ -26,6 +26,8 @@ | |||
26 | .last_balance = jiffies, \ | 26 | .last_balance = jiffies, \ |
27 | .balance_interval = 1, \ | 27 | .balance_interval = 1, \ |
28 | .nr_balance_failed = 0, \ | 28 | .nr_balance_failed = 0, \ |
29 | .max_newidle_lb_cost = 0, \ | ||
30 | .next_decay_max_lb_cost = jiffies, \ | ||
29 | } | 31 | } |
30 | 32 | ||
31 | #define cpu_to_node(cpu) ((void)(cpu), 0) | 33 | #define cpu_to_node(cpu) ((void)(cpu), 0) |
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index d3c51a6a601d..ce0bbf8f5640 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild | |||
@@ -3,3 +3,4 @@ generic-y += clkdev.h | |||
3 | generic-y += exec.h | 3 | generic-y += exec.h |
4 | generic-y += trace_clock.h | 4 | generic-y += trace_clock.h |
5 | generic-y += syscalls.h | 5 | generic-y += syscalls.h |
6 | generic-y += preempt.h | ||
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 454ddf9bb76f..1acbb8b77a71 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild | |||
@@ -11,5 +11,6 @@ generic-y += sections.h | |||
11 | generic-y += segment.h | 11 | generic-y += segment.h |
12 | generic-y += serial.h | 12 | generic-y += serial.h |
13 | generic-y += trace_clock.h | 13 | generic-y += trace_clock.h |
14 | generic-y += preempt.h | ||
14 | generic-y += ucontext.h | 15 | generic-y += ucontext.h |
15 | generic-y += xor.h | 16 | generic-y += xor.h |
diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c index d763f11e35e2..2c12ea1668d1 100644 --- a/arch/mips/kernel/rtlx.c +++ b/arch/mips/kernel/rtlx.c | |||
@@ -172,8 +172,9 @@ int rtlx_open(int index, int can_sleep) | |||
172 | if (rtlx == NULL) { | 172 | if (rtlx == NULL) { |
173 | if( (p = vpe_get_shared(tclimit)) == NULL) { | 173 | if( (p = vpe_get_shared(tclimit)) == NULL) { |
174 | if (can_sleep) { | 174 | if (can_sleep) { |
175 | __wait_event_interruptible(channel_wqs[index].lx_queue, | 175 | ret = __wait_event_interruptible( |
176 | (p = vpe_get_shared(tclimit)), ret); | 176 | channel_wqs[index].lx_queue, |
177 | (p = vpe_get_shared(tclimit))); | ||
177 | if (ret) | 178 | if (ret) |
178 | goto out_fail; | 179 | goto out_fail; |
179 | } else { | 180 | } else { |
@@ -263,11 +264,10 @@ unsigned int rtlx_read_poll(int index, int can_sleep) | |||
263 | /* data available to read? */ | 264 | /* data available to read? */ |
264 | if (chan->lx_read == chan->lx_write) { | 265 | if (chan->lx_read == chan->lx_write) { |
265 | if (can_sleep) { | 266 | if (can_sleep) { |
266 | int ret = 0; | 267 | int ret = __wait_event_interruptible( |
267 | 268 | channel_wqs[index].lx_queue, | |
268 | __wait_event_interruptible(channel_wqs[index].lx_queue, | ||
269 | (chan->lx_read != chan->lx_write) || | 269 | (chan->lx_read != chan->lx_write) || |
270 | sp_stopping, ret); | 270 | sp_stopping); |
271 | if (ret) | 271 | if (ret) |
272 | return ret; | 272 | return ret; |
273 | 273 | ||
@@ -440,14 +440,13 @@ static ssize_t file_write(struct file *file, const char __user * buffer, | |||
440 | 440 | ||
441 | /* any space left... */ | 441 | /* any space left... */ |
442 | if (!rtlx_write_poll(minor)) { | 442 | if (!rtlx_write_poll(minor)) { |
443 | int ret = 0; | 443 | int ret; |
444 | 444 | ||
445 | if (file->f_flags & O_NONBLOCK) | 445 | if (file->f_flags & O_NONBLOCK) |
446 | return -EAGAIN; | 446 | return -EAGAIN; |
447 | 447 | ||
448 | __wait_event_interruptible(channel_wqs[minor].rt_queue, | 448 | ret = __wait_event_interruptible(channel_wqs[minor].rt_queue, |
449 | rtlx_write_poll(minor), | 449 | rtlx_write_poll(minor)); |
450 | ret); | ||
451 | if (ret) | 450 | if (ret) |
452 | return ret; | 451 | return ret; |
453 | } | 452 | } |
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index e205ef598e97..12156176c7ca 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c | |||
@@ -124,7 +124,7 @@ void *kmap_coherent(struct page *page, unsigned long addr) | |||
124 | 124 | ||
125 | BUG_ON(Page_dcache_dirty(page)); | 125 | BUG_ON(Page_dcache_dirty(page)); |
126 | 126 | ||
127 | inc_preempt_count(); | 127 | pagefault_disable(); |
128 | idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1); | 128 | idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1); |
129 | #ifdef CONFIG_MIPS_MT_SMTC | 129 | #ifdef CONFIG_MIPS_MT_SMTC |
130 | idx += FIX_N_COLOURS * smp_processor_id() + | 130 | idx += FIX_N_COLOURS * smp_processor_id() + |
@@ -193,8 +193,7 @@ void kunmap_coherent(void) | |||
193 | write_c0_entryhi(old_ctx); | 193 | write_c0_entryhi(old_ctx); |
194 | EXIT_CRITICAL(flags); | 194 | EXIT_CRITICAL(flags); |
195 | #endif | 195 | #endif |
196 | dec_preempt_count(); | 196 | pagefault_enable(); |
197 | preempt_check_resched(); | ||
198 | } | 197 | } |
199 | 198 | ||
200 | void copy_user_highpage(struct page *to, struct page *from, | 199 | void copy_user_highpage(struct page *to, struct page *from, |
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index c5d767028306..74742dc6a3da 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild | |||
@@ -2,3 +2,4 @@ | |||
2 | generic-y += clkdev.h | 2 | generic-y += clkdev.h |
3 | generic-y += exec.h | 3 | generic-y += exec.h |
4 | generic-y += trace_clock.h | 4 | generic-y += trace_clock.h |
5 | generic-y += preempt.h | ||
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 195653e851da..78405625e799 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild | |||
@@ -67,3 +67,4 @@ generic-y += ucontext.h | |||
67 | generic-y += user.h | 67 | generic-y += user.h |
68 | generic-y += word-at-a-time.h | 68 | generic-y += word-at-a-time.h |
69 | generic-y += xor.h | 69 | generic-y += xor.h |
70 | generic-y += preempt.h | ||
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index ff4c9faed546..a603b9ebe54c 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild | |||
@@ -4,3 +4,4 @@ generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \ | |||
4 | div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \ | 4 | div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \ |
5 | poll.h xor.h clkdev.h exec.h | 5 | poll.h xor.h clkdev.h exec.h |
6 | generic-y += trace_clock.h | 6 | generic-y += trace_clock.h |
7 | generic-y += preempt.h | ||
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 704e6f10ae80..d8f9d2f18a23 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild | |||
@@ -2,4 +2,5 @@ | |||
2 | generic-y += clkdev.h | 2 | generic-y += clkdev.h |
3 | generic-y += rwsem.h | 3 | generic-y += rwsem.h |
4 | generic-y += trace_clock.h | 4 | generic-y += trace_clock.h |
5 | generic-y += preempt.h | ||
5 | generic-y += vtime.h \ No newline at end of file | 6 | generic-y += vtime.h \ No newline at end of file |
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index f313f9cbcf44..7a5288f3479a 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild | |||
@@ -2,3 +2,4 @@ | |||
2 | 2 | ||
3 | generic-y += clkdev.h | 3 | generic-y += clkdev.h |
4 | generic-y += trace_clock.h | 4 | generic-y += trace_clock.h |
5 | generic-y += preempt.h | ||
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index e1c7bb999b06..f3414ade77a3 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild | |||
@@ -4,3 +4,4 @@ header-y += | |||
4 | generic-y += clkdev.h | 4 | generic-y += clkdev.h |
5 | generic-y += trace_clock.h | 5 | generic-y += trace_clock.h |
6 | generic-y += xor.h | 6 | generic-y += xor.h |
7 | generic-y += preempt.h | ||
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 280bea9e5e2b..231efbb68108 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild | |||
@@ -34,3 +34,4 @@ generic-y += termios.h | |||
34 | generic-y += trace_clock.h | 34 | generic-y += trace_clock.h |
35 | generic-y += ucontext.h | 35 | generic-y += ucontext.h |
36 | generic-y += xor.h | 36 | generic-y += xor.h |
37 | generic-y += preempt.h | ||
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 7e4a97fbded4..bf390667657a 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild | |||
@@ -16,3 +16,4 @@ generic-y += serial.h | |||
16 | generic-y += trace_clock.h | 16 | generic-y += trace_clock.h |
17 | generic-y += types.h | 17 | generic-y += types.h |
18 | generic-y += word-at-a-time.h | 18 | generic-y += word-at-a-time.h |
19 | generic-y += preempt.h | ||
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild index 664d6ad23f80..22f3bd147fa7 100644 --- a/arch/tile/include/asm/Kbuild +++ b/arch/tile/include/asm/Kbuild | |||
@@ -38,3 +38,4 @@ generic-y += termios.h | |||
38 | generic-y += trace_clock.h | 38 | generic-y += trace_clock.h |
39 | generic-y += types.h | 39 | generic-y += types.h |
40 | generic-y += xor.h | 40 | generic-y += xor.h |
41 | generic-y += preempt.h | ||
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index b30f34a79882..fdde187e6087 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild | |||
@@ -3,3 +3,4 @@ generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h | |||
3 | generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h | 3 | generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h |
4 | generic-y += switch_to.h clkdev.h | 4 | generic-y += switch_to.h clkdev.h |
5 | generic-y += trace_clock.h | 5 | generic-y += trace_clock.h |
6 | generic-y += preempt.h | ||
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index 89d8b6c4e39a..00045cbe5c63 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild | |||
@@ -60,3 +60,4 @@ generic-y += unaligned.h | |||
60 | generic-y += user.h | 60 | generic-y += user.h |
61 | generic-y += vga.h | 61 | generic-y += vga.h |
62 | generic-y += xor.h | 62 | generic-y += xor.h |
63 | generic-y += preempt.h | ||
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 722aa3b04624..da31c8b8a92d 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <asm/processor.h> | 6 | #include <asm/processor.h> |
7 | #include <asm/alternative.h> | 7 | #include <asm/alternative.h> |
8 | #include <asm/cmpxchg.h> | 8 | #include <asm/cmpxchg.h> |
9 | #include <asm/rmwcc.h> | ||
9 | 10 | ||
10 | /* | 11 | /* |
11 | * Atomic operations that C can't guarantee us. Useful for | 12 | * Atomic operations that C can't guarantee us. Useful for |
@@ -76,12 +77,7 @@ static inline void atomic_sub(int i, atomic_t *v) | |||
76 | */ | 77 | */ |
77 | static inline int atomic_sub_and_test(int i, atomic_t *v) | 78 | static inline int atomic_sub_and_test(int i, atomic_t *v) |
78 | { | 79 | { |
79 | unsigned char c; | 80 | GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, i, "%0", "e"); |
80 | |||
81 | asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" | ||
82 | : "+m" (v->counter), "=qm" (c) | ||
83 | : "ir" (i) : "memory"); | ||
84 | return c; | ||
85 | } | 81 | } |
86 | 82 | ||
87 | /** | 83 | /** |
@@ -118,12 +114,7 @@ static inline void atomic_dec(atomic_t *v) | |||
118 | */ | 114 | */ |
119 | static inline int atomic_dec_and_test(atomic_t *v) | 115 | static inline int atomic_dec_and_test(atomic_t *v) |
120 | { | 116 | { |
121 | unsigned char c; | 117 | GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); |
122 | |||
123 | asm volatile(LOCK_PREFIX "decl %0; sete %1" | ||
124 | : "+m" (v->counter), "=qm" (c) | ||
125 | : : "memory"); | ||
126 | return c != 0; | ||
127 | } | 118 | } |
128 | 119 | ||
129 | /** | 120 | /** |
@@ -136,12 +127,7 @@ static inline int atomic_dec_and_test(atomic_t *v) | |||
136 | */ | 127 | */ |
137 | static inline int atomic_inc_and_test(atomic_t *v) | 128 | static inline int atomic_inc_and_test(atomic_t *v) |
138 | { | 129 | { |
139 | unsigned char c; | 130 | GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); |
140 | |||
141 | asm volatile(LOCK_PREFIX "incl %0; sete %1" | ||
142 | : "+m" (v->counter), "=qm" (c) | ||
143 | : : "memory"); | ||
144 | return c != 0; | ||
145 | } | 131 | } |
146 | 132 | ||
147 | /** | 133 | /** |
@@ -155,12 +141,7 @@ static inline int atomic_inc_and_test(atomic_t *v) | |||
155 | */ | 141 | */ |
156 | static inline int atomic_add_negative(int i, atomic_t *v) | 142 | static inline int atomic_add_negative(int i, atomic_t *v) |
157 | { | 143 | { |
158 | unsigned char c; | 144 | GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, i, "%0", "s"); |
159 | |||
160 | asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" | ||
161 | : "+m" (v->counter), "=qm" (c) | ||
162 | : "ir" (i) : "memory"); | ||
163 | return c; | ||
164 | } | 145 | } |
165 | 146 | ||
166 | /** | 147 | /** |
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 0e1cbfc8ee06..3f065c985aee 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h | |||
@@ -72,12 +72,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) | |||
72 | */ | 72 | */ |
73 | static inline int atomic64_sub_and_test(long i, atomic64_t *v) | 73 | static inline int atomic64_sub_and_test(long i, atomic64_t *v) |
74 | { | 74 | { |
75 | unsigned char c; | 75 | GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, i, "%0", "e"); |
76 | |||
77 | asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" | ||
78 | : "=m" (v->counter), "=qm" (c) | ||
79 | : "er" (i), "m" (v->counter) : "memory"); | ||
80 | return c; | ||
81 | } | 76 | } |
82 | 77 | ||
83 | /** | 78 | /** |
@@ -116,12 +111,7 @@ static inline void atomic64_dec(atomic64_t *v) | |||
116 | */ | 111 | */ |
117 | static inline int atomic64_dec_and_test(atomic64_t *v) | 112 | static inline int atomic64_dec_and_test(atomic64_t *v) |
118 | { | 113 | { |
119 | unsigned char c; | 114 | GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e"); |
120 | |||
121 | asm volatile(LOCK_PREFIX "decq %0; sete %1" | ||
122 | : "=m" (v->counter), "=qm" (c) | ||
123 | : "m" (v->counter) : "memory"); | ||
124 | return c != 0; | ||
125 | } | 115 | } |
126 | 116 | ||
127 | /** | 117 | /** |
@@ -134,12 +124,7 @@ static inline int atomic64_dec_and_test(atomic64_t *v) | |||
134 | */ | 124 | */ |
135 | static inline int atomic64_inc_and_test(atomic64_t *v) | 125 | static inline int atomic64_inc_and_test(atomic64_t *v) |
136 | { | 126 | { |
137 | unsigned char c; | 127 | GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e"); |
138 | |||
139 | asm volatile(LOCK_PREFIX "incq %0; sete %1" | ||
140 | : "=m" (v->counter), "=qm" (c) | ||
141 | : "m" (v->counter) : "memory"); | ||
142 | return c != 0; | ||
143 | } | 128 | } |
144 | 129 | ||
145 | /** | 130 | /** |
@@ -153,12 +138,7 @@ static inline int atomic64_inc_and_test(atomic64_t *v) | |||
153 | */ | 138 | */ |
154 | static inline int atomic64_add_negative(long i, atomic64_t *v) | 139 | static inline int atomic64_add_negative(long i, atomic64_t *v) |
155 | { | 140 | { |
156 | unsigned char c; | 141 | GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, i, "%0", "s"); |
157 | |||
158 | asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" | ||
159 | : "=m" (v->counter), "=qm" (c) | ||
160 | : "er" (i), "m" (v->counter) : "memory"); | ||
161 | return c; | ||
162 | } | 142 | } |
163 | 143 | ||
164 | /** | 144 | /** |
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 41639ce8fd63..6d76d0935989 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h | |||
@@ -14,6 +14,7 @@ | |||
14 | 14 | ||
15 | #include <linux/compiler.h> | 15 | #include <linux/compiler.h> |
16 | #include <asm/alternative.h> | 16 | #include <asm/alternative.h> |
17 | #include <asm/rmwcc.h> | ||
17 | 18 | ||
18 | #if BITS_PER_LONG == 32 | 19 | #if BITS_PER_LONG == 32 |
19 | # define _BITOPS_LONG_SHIFT 5 | 20 | # define _BITOPS_LONG_SHIFT 5 |
@@ -204,12 +205,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr) | |||
204 | */ | 205 | */ |
205 | static inline int test_and_set_bit(long nr, volatile unsigned long *addr) | 206 | static inline int test_and_set_bit(long nr, volatile unsigned long *addr) |
206 | { | 207 | { |
207 | int oldbit; | 208 | GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, nr, "%0", "c"); |
208 | |||
209 | asm volatile(LOCK_PREFIX "bts %2,%1\n\t" | ||
210 | "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); | ||
211 | |||
212 | return oldbit; | ||
213 | } | 209 | } |
214 | 210 | ||
215 | /** | 211 | /** |
@@ -255,13 +251,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) | |||
255 | */ | 251 | */ |
256 | static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) | 252 | static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) |
257 | { | 253 | { |
258 | int oldbit; | 254 | GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, nr, "%0", "c"); |
259 | |||
260 | asm volatile(LOCK_PREFIX "btr %2,%1\n\t" | ||
261 | "sbb %0,%0" | ||
262 | : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); | ||
263 | |||
264 | return oldbit; | ||
265 | } | 255 | } |
266 | 256 | ||
267 | /** | 257 | /** |
@@ -314,13 +304,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) | |||
314 | */ | 304 | */ |
315 | static inline int test_and_change_bit(long nr, volatile unsigned long *addr) | 305 | static inline int test_and_change_bit(long nr, volatile unsigned long *addr) |
316 | { | 306 | { |
317 | int oldbit; | 307 | GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, nr, "%0", "c"); |
318 | |||
319 | asm volatile(LOCK_PREFIX "btc %2,%1\n\t" | ||
320 | "sbb %0,%0" | ||
321 | : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); | ||
322 | |||
323 | return oldbit; | ||
324 | } | 308 | } |
325 | 309 | ||
326 | static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) | 310 | static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) |
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 0fa675033912..cb4c73bfeb48 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h | |||
@@ -48,6 +48,8 @@ For 32-bit we have the following conventions - kernel is built with | |||
48 | 48 | ||
49 | #include <asm/dwarf2.h> | 49 | #include <asm/dwarf2.h> |
50 | 50 | ||
51 | #ifdef CONFIG_X86_64 | ||
52 | |||
51 | /* | 53 | /* |
52 | * 64-bit system call stack frame layout defines and helpers, | 54 | * 64-bit system call stack frame layout defines and helpers, |
53 | * for assembly code: | 55 | * for assembly code: |
@@ -192,3 +194,51 @@ For 32-bit we have the following conventions - kernel is built with | |||
192 | .macro icebp | 194 | .macro icebp |
193 | .byte 0xf1 | 195 | .byte 0xf1 |
194 | .endm | 196 | .endm |
197 | |||
198 | #else /* CONFIG_X86_64 */ | ||
199 | |||
200 | /* | ||
201 | * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These | ||
202 | * are different from the entry_32.S versions in not changing the segment | ||
203 | * registers. So only suitable for in kernel use, not when transitioning | ||
204 | * from or to user space. The resulting stack frame is not a standard | ||
205 | * pt_regs frame. The main use case is calling C code from assembler | ||
206 | * when all the registers need to be preserved. | ||
207 | */ | ||
208 | |||
209 | .macro SAVE_ALL | ||
210 | pushl_cfi %eax | ||
211 | CFI_REL_OFFSET eax, 0 | ||
212 | pushl_cfi %ebp | ||
213 | CFI_REL_OFFSET ebp, 0 | ||
214 | pushl_cfi %edi | ||
215 | CFI_REL_OFFSET edi, 0 | ||
216 | pushl_cfi %esi | ||
217 | CFI_REL_OFFSET esi, 0 | ||
218 | pushl_cfi %edx | ||
219 | CFI_REL_OFFSET edx, 0 | ||
220 | pushl_cfi %ecx | ||
221 | CFI_REL_OFFSET ecx, 0 | ||
222 | pushl_cfi %ebx | ||
223 | CFI_REL_OFFSET ebx, 0 | ||
224 | .endm | ||
225 | |||
226 | .macro RESTORE_ALL | ||
227 | popl_cfi %ebx | ||
228 | CFI_RESTORE ebx | ||
229 | popl_cfi %ecx | ||
230 | CFI_RESTORE ecx | ||
231 | popl_cfi %edx | ||
232 | CFI_RESTORE edx | ||
233 | popl_cfi %esi | ||
234 | CFI_RESTORE esi | ||
235 | popl_cfi %edi | ||
236 | CFI_RESTORE edi | ||
237 | popl_cfi %ebp | ||
238 | CFI_RESTORE ebp | ||
239 | popl_cfi %eax | ||
240 | CFI_RESTORE eax | ||
241 | .endm | ||
242 | |||
243 | #endif /* CONFIG_X86_64 */ | ||
244 | |||
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 2d89e3980cbd..5b23e605e707 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h | |||
@@ -52,12 +52,7 @@ static inline void local_sub(long i, local_t *l) | |||
52 | */ | 52 | */ |
53 | static inline int local_sub_and_test(long i, local_t *l) | 53 | static inline int local_sub_and_test(long i, local_t *l) |
54 | { | 54 | { |
55 | unsigned char c; | 55 | GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, i, "%0", "e"); |
56 | |||
57 | asm volatile(_ASM_SUB "%2,%0; sete %1" | ||
58 | : "+m" (l->a.counter), "=qm" (c) | ||
59 | : "ir" (i) : "memory"); | ||
60 | return c; | ||
61 | } | 56 | } |
62 | 57 | ||
63 | /** | 58 | /** |
@@ -70,12 +65,7 @@ static inline int local_sub_and_test(long i, local_t *l) | |||
70 | */ | 65 | */ |
71 | static inline int local_dec_and_test(local_t *l) | 66 | static inline int local_dec_and_test(local_t *l) |
72 | { | 67 | { |
73 | unsigned char c; | 68 | GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e"); |
74 | |||
75 | asm volatile(_ASM_DEC "%0; sete %1" | ||
76 | : "+m" (l->a.counter), "=qm" (c) | ||
77 | : : "memory"); | ||
78 | return c != 0; | ||
79 | } | 69 | } |
80 | 70 | ||
81 | /** | 71 | /** |
@@ -88,12 +78,7 @@ static inline int local_dec_and_test(local_t *l) | |||
88 | */ | 78 | */ |
89 | static inline int local_inc_and_test(local_t *l) | 79 | static inline int local_inc_and_test(local_t *l) |
90 | { | 80 | { |
91 | unsigned char c; | 81 | GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e"); |
92 | |||
93 | asm volatile(_ASM_INC "%0; sete %1" | ||
94 | : "+m" (l->a.counter), "=qm" (c) | ||
95 | : : "memory"); | ||
96 | return c != 0; | ||
97 | } | 82 | } |
98 | 83 | ||
99 | /** | 84 | /** |
@@ -107,12 +92,7 @@ static inline int local_inc_and_test(local_t *l) | |||
107 | */ | 92 | */ |
108 | static inline int local_add_negative(long i, local_t *l) | 93 | static inline int local_add_negative(long i, local_t *l) |
109 | { | 94 | { |
110 | unsigned char c; | 95 | GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, i, "%0", "s"); |
111 | |||
112 | asm volatile(_ASM_ADD "%2,%0; sets %1" | ||
113 | : "+m" (l->a.counter), "=qm" (c) | ||
114 | : "ir" (i) : "memory"); | ||
115 | return c; | ||
116 | } | 96 | } |
117 | 97 | ||
118 | /** | 98 | /** |
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h new file mode 100644 index 000000000000..8729723636fd --- /dev/null +++ b/arch/x86/include/asm/preempt.h | |||
@@ -0,0 +1,100 @@ | |||
1 | #ifndef __ASM_PREEMPT_H | ||
2 | #define __ASM_PREEMPT_H | ||
3 | |||
4 | #include <asm/rmwcc.h> | ||
5 | #include <asm/percpu.h> | ||
6 | #include <linux/thread_info.h> | ||
7 | |||
8 | DECLARE_PER_CPU(int, __preempt_count); | ||
9 | |||
10 | /* | ||
11 | * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users | ||
12 | * that think a non-zero value indicates we cannot preempt. | ||
13 | */ | ||
14 | static __always_inline int preempt_count(void) | ||
15 | { | ||
16 | return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; | ||
17 | } | ||
18 | |||
19 | static __always_inline void preempt_count_set(int pc) | ||
20 | { | ||
21 | __this_cpu_write_4(__preempt_count, pc); | ||
22 | } | ||
23 | |||
24 | /* | ||
25 | * must be macros to avoid header recursion hell | ||
26 | */ | ||
27 | #define task_preempt_count(p) \ | ||
28 | (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED) | ||
29 | |||
30 | #define init_task_preempt_count(p) do { \ | ||
31 | task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ | ||
32 | } while (0) | ||
33 | |||
34 | #define init_idle_preempt_count(p, cpu) do { \ | ||
35 | task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \ | ||
36 | per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ | ||
37 | } while (0) | ||
38 | |||
39 | /* | ||
40 | * We fold the NEED_RESCHED bit into the preempt count such that | ||
41 | * preempt_enable() can decrement and test for needing to reschedule with a | ||
42 | * single instruction. | ||
43 | * | ||
44 | * We invert the actual bit, so that when the decrement hits 0 we know we both | ||
45 | * need to resched (the bit is cleared) and can resched (no preempt count). | ||
46 | */ | ||
47 | |||
48 | static __always_inline void set_preempt_need_resched(void) | ||
49 | { | ||
50 | __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); | ||
51 | } | ||
52 | |||
53 | static __always_inline void clear_preempt_need_resched(void) | ||
54 | { | ||
55 | __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); | ||
56 | } | ||
57 | |||
58 | static __always_inline bool test_preempt_need_resched(void) | ||
59 | { | ||
60 | return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * The various preempt_count add/sub methods | ||
65 | */ | ||
66 | |||
67 | static __always_inline void __preempt_count_add(int val) | ||
68 | { | ||
69 | __this_cpu_add_4(__preempt_count, val); | ||
70 | } | ||
71 | |||
72 | static __always_inline void __preempt_count_sub(int val) | ||
73 | { | ||
74 | __this_cpu_add_4(__preempt_count, -val); | ||
75 | } | ||
76 | |||
77 | static __always_inline bool __preempt_count_dec_and_test(void) | ||
78 | { | ||
79 | GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e"); | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Returns true when we need to resched and can (barring IRQ state). | ||
84 | */ | ||
85 | static __always_inline bool should_resched(void) | ||
86 | { | ||
87 | return unlikely(!__this_cpu_read_4(__preempt_count)); | ||
88 | } | ||
89 | |||
90 | #ifdef CONFIG_PREEMPT | ||
91 | extern asmlinkage void ___preempt_schedule(void); | ||
92 | # define __preempt_schedule() asm ("call ___preempt_schedule") | ||
93 | extern asmlinkage void preempt_schedule(void); | ||
94 | # ifdef CONFIG_CONTEXT_TRACKING | ||
95 | extern asmlinkage void ___preempt_schedule_context(void); | ||
96 | # define __preempt_schedule_context() asm ("call ___preempt_schedule_context") | ||
97 | # endif | ||
98 | #endif | ||
99 | |||
100 | #endif /* __ASM_PREEMPT_H */ | ||
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h new file mode 100644 index 000000000000..735f1849795f --- /dev/null +++ b/arch/x86/include/asm/rmwcc.h | |||
@@ -0,0 +1,41 @@ | |||
1 | #ifndef _ASM_X86_RMWcc | ||
2 | #define _ASM_X86_RMWcc | ||
3 | |||
4 | #ifdef CC_HAVE_ASM_GOTO | ||
5 | |||
6 | #define __GEN_RMWcc(fullop, var, cc, ...) \ | ||
7 | do { \ | ||
8 | asm volatile goto (fullop "; j" cc " %l[cc_label]" \ | ||
9 | : : "m" (var), ## __VA_ARGS__ \ | ||
10 | : "memory" : cc_label); \ | ||
11 | return 0; \ | ||
12 | cc_label: \ | ||
13 | return 1; \ | ||
14 | } while (0) | ||
15 | |||
16 | #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ | ||
17 | __GEN_RMWcc(op " " arg0, var, cc) | ||
18 | |||
19 | #define GEN_BINARY_RMWcc(op, var, val, arg0, cc) \ | ||
20 | __GEN_RMWcc(op " %1, " arg0, var, cc, "er" (val)) | ||
21 | |||
22 | #else /* !CC_HAVE_ASM_GOTO */ | ||
23 | |||
24 | #define __GEN_RMWcc(fullop, var, cc, ...) \ | ||
25 | do { \ | ||
26 | char c; \ | ||
27 | asm volatile (fullop "; set" cc " %1" \ | ||
28 | : "+m" (var), "=qm" (c) \ | ||
29 | : __VA_ARGS__ : "memory"); \ | ||
30 | return c != 0; \ | ||
31 | } while (0) | ||
32 | |||
33 | #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ | ||
34 | __GEN_RMWcc(op " " arg0, var, cc) | ||
35 | |||
36 | #define GEN_BINARY_RMWcc(op, var, val, arg0, cc) \ | ||
37 | __GEN_RMWcc(op " %2, " arg0, var, cc, "er" (val)) | ||
38 | |||
39 | #endif /* CC_HAVE_ASM_GOTO */ | ||
40 | |||
41 | #endif /* _ASM_X86_RMWcc */ | ||
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 27811190cbd7..c46a46be1ec6 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h | |||
@@ -28,8 +28,7 @@ struct thread_info { | |||
28 | __u32 flags; /* low level flags */ | 28 | __u32 flags; /* low level flags */ |
29 | __u32 status; /* thread synchronous flags */ | 29 | __u32 status; /* thread synchronous flags */ |
30 | __u32 cpu; /* current CPU */ | 30 | __u32 cpu; /* current CPU */ |
31 | int preempt_count; /* 0 => preemptable, | 31 | int saved_preempt_count; |
32 | <0 => BUG */ | ||
33 | mm_segment_t addr_limit; | 32 | mm_segment_t addr_limit; |
34 | struct restart_block restart_block; | 33 | struct restart_block restart_block; |
35 | void __user *sysenter_return; | 34 | void __user *sysenter_return; |
@@ -49,7 +48,7 @@ struct thread_info { | |||
49 | .exec_domain = &default_exec_domain, \ | 48 | .exec_domain = &default_exec_domain, \ |
50 | .flags = 0, \ | 49 | .flags = 0, \ |
51 | .cpu = 0, \ | 50 | .cpu = 0, \ |
52 | .preempt_count = INIT_PREEMPT_COUNT, \ | 51 | .saved_preempt_count = INIT_PREEMPT_COUNT, \ |
53 | .addr_limit = KERNEL_DS, \ | 52 | .addr_limit = KERNEL_DS, \ |
54 | .restart_block = { \ | 53 | .restart_block = { \ |
55 | .fn = do_no_restart_syscall, \ | 54 | .fn = do_no_restart_syscall, \ |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a5408b965c9d..9b0a34e2cd79 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -36,6 +36,8 @@ obj-y += tsc.o io_delay.o rtc.o | |||
36 | obj-y += pci-iommu_table.o | 36 | obj-y += pci-iommu_table.o |
37 | obj-y += resource.o | 37 | obj-y += resource.o |
38 | 38 | ||
39 | obj-$(CONFIG_PREEMPT) += preempt.o | ||
40 | |||
39 | obj-y += process.o | 41 | obj-y += process.o |
40 | obj-y += i387.o xsave.o | 42 | obj-y += i387.o xsave.o |
41 | obj-y += ptrace.o | 43 | obj-y += ptrace.o |
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 28610822fb3c..9f6b9341950f 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c | |||
@@ -32,7 +32,6 @@ void common(void) { | |||
32 | OFFSET(TI_flags, thread_info, flags); | 32 | OFFSET(TI_flags, thread_info, flags); |
33 | OFFSET(TI_status, thread_info, status); | 33 | OFFSET(TI_status, thread_info, status); |
34 | OFFSET(TI_addr_limit, thread_info, addr_limit); | 34 | OFFSET(TI_addr_limit, thread_info, addr_limit); |
35 | OFFSET(TI_preempt_count, thread_info, preempt_count); | ||
36 | 35 | ||
37 | BLANK(); | 36 | BLANK(); |
38 | OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); | 37 | OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2793d1f095a2..5223fe6dec7b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -1095,6 +1095,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = | |||
1095 | 1095 | ||
1096 | DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; | 1096 | DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; |
1097 | 1097 | ||
1098 | DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; | ||
1099 | EXPORT_PER_CPU_SYMBOL(__preempt_count); | ||
1100 | |||
1098 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); | 1101 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); |
1099 | 1102 | ||
1100 | /* | 1103 | /* |
@@ -1169,6 +1172,8 @@ void debug_stack_reset(void) | |||
1169 | 1172 | ||
1170 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; | 1173 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; |
1171 | EXPORT_PER_CPU_SYMBOL(current_task); | 1174 | EXPORT_PER_CPU_SYMBOL(current_task); |
1175 | DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; | ||
1176 | EXPORT_PER_CPU_SYMBOL(__preempt_count); | ||
1172 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); | 1177 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); |
1173 | 1178 | ||
1174 | #ifdef CONFIG_CC_STACKPROTECTOR | 1179 | #ifdef CONFIG_CC_STACKPROTECTOR |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f0dcb0ceb6a2..fd1bc1b15e6d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -362,12 +362,9 @@ END(ret_from_exception) | |||
362 | #ifdef CONFIG_PREEMPT | 362 | #ifdef CONFIG_PREEMPT |
363 | ENTRY(resume_kernel) | 363 | ENTRY(resume_kernel) |
364 | DISABLE_INTERRUPTS(CLBR_ANY) | 364 | DISABLE_INTERRUPTS(CLBR_ANY) |
365 | cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? | ||
366 | jnz restore_all | ||
367 | need_resched: | 365 | need_resched: |
368 | movl TI_flags(%ebp), %ecx # need_resched set ? | 366 | cmpl $0,PER_CPU_VAR(__preempt_count) |
369 | testb $_TIF_NEED_RESCHED, %cl | 367 | jnz restore_all |
370 | jz restore_all | ||
371 | testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? | 368 | testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? |
372 | jz restore_all | 369 | jz restore_all |
373 | call preempt_schedule_irq | 370 | call preempt_schedule_irq |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b077f4cc225a..1a2cc64abcd7 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -1103,10 +1103,8 @@ retint_signal: | |||
1103 | /* Returning to kernel space. Check if we need preemption */ | 1103 | /* Returning to kernel space. Check if we need preemption */ |
1104 | /* rcx: threadinfo. interrupts off. */ | 1104 | /* rcx: threadinfo. interrupts off. */ |
1105 | ENTRY(retint_kernel) | 1105 | ENTRY(retint_kernel) |
1106 | cmpl $0,TI_preempt_count(%rcx) | 1106 | cmpl $0,PER_CPU_VAR(__preempt_count) |
1107 | jnz retint_restore_args | 1107 | jnz retint_restore_args |
1108 | bt $TIF_NEED_RESCHED,TI_flags(%rcx) | ||
1109 | jnc retint_restore_args | ||
1110 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ | 1108 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ |
1111 | jnc retint_restore_args | 1109 | jnc retint_restore_args |
1112 | call preempt_schedule_irq | 1110 | call preempt_schedule_irq |
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 0fa69127209a..05fd74f537d6 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c | |||
@@ -37,3 +37,10 @@ EXPORT_SYMBOL(strstr); | |||
37 | 37 | ||
38 | EXPORT_SYMBOL(csum_partial); | 38 | EXPORT_SYMBOL(csum_partial); |
39 | EXPORT_SYMBOL(empty_zero_page); | 39 | EXPORT_SYMBOL(empty_zero_page); |
40 | |||
41 | #ifdef CONFIG_PREEMPT | ||
42 | EXPORT_SYMBOL(___preempt_schedule); | ||
43 | #ifdef CONFIG_CONTEXT_TRACKING | ||
44 | EXPORT_SYMBOL(___preempt_schedule_context); | ||
45 | #endif | ||
46 | #endif | ||
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 4186755f1d7c..3fe066359ac0 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -100,9 +100,6 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) | |||
100 | irqctx->tinfo.task = curctx->tinfo.task; | 100 | irqctx->tinfo.task = curctx->tinfo.task; |
101 | irqctx->tinfo.previous_esp = current_stack_pointer; | 101 | irqctx->tinfo.previous_esp = current_stack_pointer; |
102 | 102 | ||
103 | /* Copy the preempt_count so that the [soft]irq checks work. */ | ||
104 | irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count; | ||
105 | |||
106 | if (unlikely(overflow)) | 103 | if (unlikely(overflow)) |
107 | call_on_stack(print_stack_overflow, isp); | 104 | call_on_stack(print_stack_overflow, isp); |
108 | 105 | ||
@@ -131,7 +128,6 @@ void irq_ctx_init(int cpu) | |||
131 | THREAD_SIZE_ORDER)); | 128 | THREAD_SIZE_ORDER)); |
132 | memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); | 129 | memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); |
133 | irqctx->tinfo.cpu = cpu; | 130 | irqctx->tinfo.cpu = cpu; |
134 | irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; | ||
135 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | 131 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); |
136 | 132 | ||
137 | per_cpu(hardirq_ctx, cpu) = irqctx; | 133 | per_cpu(hardirq_ctx, cpu) = irqctx; |
diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S new file mode 100644 index 000000000000..ca7f0d58a87d --- /dev/null +++ b/arch/x86/kernel/preempt.S | |||
@@ -0,0 +1,25 @@ | |||
1 | |||
2 | #include <linux/linkage.h> | ||
3 | #include <asm/dwarf2.h> | ||
4 | #include <asm/asm.h> | ||
5 | #include <asm/calling.h> | ||
6 | |||
7 | ENTRY(___preempt_schedule) | ||
8 | CFI_STARTPROC | ||
9 | SAVE_ALL | ||
10 | call preempt_schedule | ||
11 | RESTORE_ALL | ||
12 | ret | ||
13 | CFI_ENDPROC | ||
14 | |||
15 | #ifdef CONFIG_CONTEXT_TRACKING | ||
16 | |||
17 | ENTRY(___preempt_schedule_context) | ||
18 | CFI_STARTPROC | ||
19 | SAVE_ALL | ||
20 | call preempt_schedule_context | ||
21 | RESTORE_ALL | ||
22 | ret | ||
23 | CFI_ENDPROC | ||
24 | |||
25 | #endif | ||
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c83516be1052..3fb8d95ab8b5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -391,9 +391,9 @@ static void amd_e400_idle(void) | |||
391 | * The switch back from broadcast mode needs to be | 391 | * The switch back from broadcast mode needs to be |
392 | * called with interrupts disabled. | 392 | * called with interrupts disabled. |
393 | */ | 393 | */ |
394 | local_irq_disable(); | 394 | local_irq_disable(); |
395 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); | 395 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); |
396 | local_irq_enable(); | 396 | local_irq_enable(); |
397 | } else | 397 | } else |
398 | default_idle(); | 398 | default_idle(); |
399 | } | 399 | } |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 884f98f69354..c2ec1aa6d454 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -292,6 +292,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
292 | set_iopl_mask(next->iopl); | 292 | set_iopl_mask(next->iopl); |
293 | 293 | ||
294 | /* | 294 | /* |
295 | * If it were not for PREEMPT_ACTIVE we could guarantee that the | ||
296 | * preempt_count of all tasks was equal here and this would not be | ||
297 | * needed. | ||
298 | */ | ||
299 | task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); | ||
300 | this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); | ||
301 | |||
302 | /* | ||
295 | * Now maybe handle debug registers and/or IO bitmaps | 303 | * Now maybe handle debug registers and/or IO bitmaps |
296 | */ | 304 | */ |
297 | if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || | 305 | if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bb1dc51bab05..45ab4d6fc8a7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
363 | this_cpu_write(old_rsp, next->usersp); | 363 | this_cpu_write(old_rsp, next->usersp); |
364 | this_cpu_write(current_task, next_p); | 364 | this_cpu_write(current_task, next_p); |
365 | 365 | ||
366 | /* | ||
367 | * If it were not for PREEMPT_ACTIVE we could guarantee that the | ||
368 | * preempt_count of all tasks was equal here and this would not be | ||
369 | * needed. | ||
370 | */ | ||
371 | task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); | ||
372 | this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); | ||
373 | |||
366 | this_cpu_write(kernel_stack, | 374 | this_cpu_write(kernel_stack, |
367 | (unsigned long)task_stack_page(next_p) + | 375 | (unsigned long)task_stack_page(next_p) + |
368 | THREAD_SIZE - KERNEL_STACK_OFFSET); | 376 | THREAD_SIZE - KERNEL_STACK_OFFSET); |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8c8093b146ca..729aa779ff75 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -88,7 +88,7 @@ static inline void conditional_sti(struct pt_regs *regs) | |||
88 | 88 | ||
89 | static inline void preempt_conditional_sti(struct pt_regs *regs) | 89 | static inline void preempt_conditional_sti(struct pt_regs *regs) |
90 | { | 90 | { |
91 | inc_preempt_count(); | 91 | preempt_count_inc(); |
92 | if (regs->flags & X86_EFLAGS_IF) | 92 | if (regs->flags & X86_EFLAGS_IF) |
93 | local_irq_enable(); | 93 | local_irq_enable(); |
94 | } | 94 | } |
@@ -103,7 +103,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) | |||
103 | { | 103 | { |
104 | if (regs->flags & X86_EFLAGS_IF) | 104 | if (regs->flags & X86_EFLAGS_IF) |
105 | local_irq_disable(); | 105 | local_irq_disable(); |
106 | dec_preempt_count(); | 106 | preempt_count_dec(); |
107 | } | 107 | } |
108 | 108 | ||
109 | static int __kprobes | 109 | static int __kprobes |
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index b014d9414d08..040681928e9d 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c | |||
@@ -66,3 +66,10 @@ EXPORT_SYMBOL(empty_zero_page); | |||
66 | #ifndef CONFIG_PARAVIRT | 66 | #ifndef CONFIG_PARAVIRT |
67 | EXPORT_SYMBOL(native_load_gs_index); | 67 | EXPORT_SYMBOL(native_load_gs_index); |
68 | #endif | 68 | #endif |
69 | |||
70 | #ifdef CONFIG_PREEMPT | ||
71 | EXPORT_SYMBOL(___preempt_schedule); | ||
72 | #ifdef CONFIG_CONTEXT_TRACKING | ||
73 | EXPORT_SYMBOL(___preempt_schedule_context); | ||
74 | #endif | ||
75 | #endif | ||
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 1b982641ec35..228d6aee3a16 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild | |||
@@ -28,3 +28,4 @@ generic-y += termios.h | |||
28 | generic-y += topology.h | 28 | generic-y += topology.h |
29 | generic-y += trace_clock.h | 29 | generic-y += trace_clock.h |
30 | generic-y += xor.h | 30 | generic-y += xor.h |
31 | generic-y += preempt.h | ||
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index f98dd00b51a9..c7414a545a4f 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c | |||
@@ -119,17 +119,10 @@ static struct dmi_system_id processor_power_dmi_table[] = { | |||
119 | */ | 119 | */ |
120 | static void acpi_safe_halt(void) | 120 | static void acpi_safe_halt(void) |
121 | { | 121 | { |
122 | current_thread_info()->status &= ~TS_POLLING; | 122 | if (!tif_need_resched()) { |
123 | /* | ||
124 | * TS_POLLING-cleared state must be visible before we | ||
125 | * test NEED_RESCHED: | ||
126 | */ | ||
127 | smp_mb(); | ||
128 | if (!need_resched()) { | ||
129 | safe_halt(); | 123 | safe_halt(); |
130 | local_irq_disable(); | 124 | local_irq_disable(); |
131 | } | 125 | } |
132 | current_thread_info()->status |= TS_POLLING; | ||
133 | } | 126 | } |
134 | 127 | ||
135 | #ifdef ARCH_APICTIMER_STOPS_ON_C3 | 128 | #ifdef ARCH_APICTIMER_STOPS_ON_C3 |
@@ -737,6 +730,11 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, | |||
737 | if (unlikely(!pr)) | 730 | if (unlikely(!pr)) |
738 | return -EINVAL; | 731 | return -EINVAL; |
739 | 732 | ||
733 | if (cx->entry_method == ACPI_CSTATE_FFH) { | ||
734 | if (current_set_polling_and_test()) | ||
735 | return -EINVAL; | ||
736 | } | ||
737 | |||
740 | lapic_timer_state_broadcast(pr, cx, 1); | 738 | lapic_timer_state_broadcast(pr, cx, 1); |
741 | acpi_idle_do_entry(cx); | 739 | acpi_idle_do_entry(cx); |
742 | 740 | ||
@@ -790,18 +788,9 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, | |||
790 | if (unlikely(!pr)) | 788 | if (unlikely(!pr)) |
791 | return -EINVAL; | 789 | return -EINVAL; |
792 | 790 | ||
793 | if (cx->entry_method != ACPI_CSTATE_FFH) { | 791 | if (cx->entry_method == ACPI_CSTATE_FFH) { |
794 | current_thread_info()->status &= ~TS_POLLING; | 792 | if (current_set_polling_and_test()) |
795 | /* | ||
796 | * TS_POLLING-cleared state must be visible before we test | ||
797 | * NEED_RESCHED: | ||
798 | */ | ||
799 | smp_mb(); | ||
800 | |||
801 | if (unlikely(need_resched())) { | ||
802 | current_thread_info()->status |= TS_POLLING; | ||
803 | return -EINVAL; | 793 | return -EINVAL; |
804 | } | ||
805 | } | 794 | } |
806 | 795 | ||
807 | /* | 796 | /* |
@@ -819,9 +808,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, | |||
819 | 808 | ||
820 | sched_clock_idle_wakeup_event(0); | 809 | sched_clock_idle_wakeup_event(0); |
821 | 810 | ||
822 | if (cx->entry_method != ACPI_CSTATE_FFH) | ||
823 | current_thread_info()->status |= TS_POLLING; | ||
824 | |||
825 | lapic_timer_state_broadcast(pr, cx, 0); | 811 | lapic_timer_state_broadcast(pr, cx, 0); |
826 | return index; | 812 | return index; |
827 | } | 813 | } |
@@ -858,18 +844,9 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, | |||
858 | } | 844 | } |
859 | } | 845 | } |
860 | 846 | ||
861 | if (cx->entry_method != ACPI_CSTATE_FFH) { | 847 | if (cx->entry_method == ACPI_CSTATE_FFH) { |
862 | current_thread_info()->status &= ~TS_POLLING; | 848 | if (current_set_polling_and_test()) |
863 | /* | ||
864 | * TS_POLLING-cleared state must be visible before we test | ||
865 | * NEED_RESCHED: | ||
866 | */ | ||
867 | smp_mb(); | ||
868 | |||
869 | if (unlikely(need_resched())) { | ||
870 | current_thread_info()->status |= TS_POLLING; | ||
871 | return -EINVAL; | 849 | return -EINVAL; |
872 | } | ||
873 | } | 850 | } |
874 | 851 | ||
875 | acpi_unlazy_tlb(smp_processor_id()); | 852 | acpi_unlazy_tlb(smp_processor_id()); |
@@ -915,9 +892,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, | |||
915 | 892 | ||
916 | sched_clock_idle_wakeup_event(0); | 893 | sched_clock_idle_wakeup_event(0); |
917 | 894 | ||
918 | if (cx->entry_method != ACPI_CSTATE_FFH) | ||
919 | current_thread_info()->status |= TS_POLLING; | ||
920 | |||
921 | lapic_timer_state_broadcast(pr, cx, 0); | 895 | lapic_timer_state_broadcast(pr, cx, 0); |
922 | return index; | 896 | return index; |
923 | } | 897 | } |
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index fa6964d8681a..f116d664b473 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c | |||
@@ -359,7 +359,7 @@ static int intel_idle(struct cpuidle_device *dev, | |||
359 | if (!(lapic_timer_reliable_states & (1 << (cstate)))) | 359 | if (!(lapic_timer_reliable_states & (1 << (cstate)))) |
360 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); | 360 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); |
361 | 361 | ||
362 | if (!need_resched()) { | 362 | if (!current_set_polling_and_test()) { |
363 | 363 | ||
364 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | 364 | __monitor((void *)¤t_thread_info()->flags, 0, 0); |
365 | smp_mb(); | 365 | smp_mb(); |
@@ -1547,6 +1547,7 @@ static int do_execve_common(const char *filename, | |||
1547 | current->fs->in_exec = 0; | 1547 | current->fs->in_exec = 0; |
1548 | current->in_execve = 0; | 1548 | current->in_execve = 0; |
1549 | acct_update_integrals(current); | 1549 | acct_update_integrals(current); |
1550 | task_numa_free(current); | ||
1550 | free_bprm(bprm); | 1551 | free_bprm(bprm); |
1551 | if (displaced) | 1552 | if (displaced) |
1552 | put_files_struct(displaced); | 1553 | put_files_struct(displaced); |
diff --git a/fs/proc/array.c b/fs/proc/array.c index cbd0f1b324b9..1bd2077187fd 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -183,6 +183,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, | |||
183 | seq_printf(m, | 183 | seq_printf(m, |
184 | "State:\t%s\n" | 184 | "State:\t%s\n" |
185 | "Tgid:\t%d\n" | 185 | "Tgid:\t%d\n" |
186 | "Ngid:\t%d\n" | ||
186 | "Pid:\t%d\n" | 187 | "Pid:\t%d\n" |
187 | "PPid:\t%d\n" | 188 | "PPid:\t%d\n" |
188 | "TracerPid:\t%d\n" | 189 | "TracerPid:\t%d\n" |
@@ -190,6 +191,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, | |||
190 | "Gid:\t%d\t%d\t%d\t%d\n", | 191 | "Gid:\t%d\t%d\t%d\t%d\n", |
191 | get_task_state(p), | 192 | get_task_state(p), |
192 | task_tgid_nr_ns(p, ns), | 193 | task_tgid_nr_ns(p, ns), |
194 | task_numa_group_id(p), | ||
193 | pid_nr_ns(pid, ns), | 195 | pid_nr_ns(pid, ns), |
194 | ppid, tpid, | 196 | ppid, tpid, |
195 | from_kuid_munged(user_ns, cred->uid), | 197 | from_kuid_munged(user_ns, cred->uid), |
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h new file mode 100644 index 000000000000..ddf2b420ac8f --- /dev/null +++ b/include/asm-generic/preempt.h | |||
@@ -0,0 +1,105 @@ | |||
1 | #ifndef __ASM_PREEMPT_H | ||
2 | #define __ASM_PREEMPT_H | ||
3 | |||
4 | #include <linux/thread_info.h> | ||
5 | |||
6 | /* | ||
7 | * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users | ||
8 | * that think a non-zero value indicates we cannot preempt. | ||
9 | */ | ||
10 | static __always_inline int preempt_count(void) | ||
11 | { | ||
12 | return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED; | ||
13 | } | ||
14 | |||
15 | static __always_inline int *preempt_count_ptr(void) | ||
16 | { | ||
17 | return ¤t_thread_info()->preempt_count; | ||
18 | } | ||
19 | |||
20 | /* | ||
21 | * We now loose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the | ||
22 | * alternative is loosing a reschedule. Better schedule too often -- also this | ||
23 | * should be a very rare operation. | ||
24 | */ | ||
25 | static __always_inline void preempt_count_set(int pc) | ||
26 | { | ||
27 | *preempt_count_ptr() = pc; | ||
28 | } | ||
29 | |||
30 | /* | ||
31 | * must be macros to avoid header recursion hell | ||
32 | */ | ||
33 | #define task_preempt_count(p) \ | ||
34 | (task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED) | ||
35 | |||
36 | #define init_task_preempt_count(p) do { \ | ||
37 | task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ | ||
38 | } while (0) | ||
39 | |||
40 | #define init_idle_preempt_count(p, cpu) do { \ | ||
41 | task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \ | ||
42 | } while (0) | ||
43 | |||
44 | /* | ||
45 | * We fold the NEED_RESCHED bit into the preempt count such that | ||
46 | * preempt_enable() can decrement and test for needing to reschedule with a | ||
47 | * single instruction. | ||
48 | * | ||
49 | * We invert the actual bit, so that when the decrement hits 0 we know we both | ||
50 | * need to resched (the bit is cleared) and can resched (no preempt count). | ||
51 | */ | ||
52 | |||
53 | static __always_inline void set_preempt_need_resched(void) | ||
54 | { | ||
55 | *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED; | ||
56 | } | ||
57 | |||
58 | static __always_inline void clear_preempt_need_resched(void) | ||
59 | { | ||
60 | *preempt_count_ptr() |= PREEMPT_NEED_RESCHED; | ||
61 | } | ||
62 | |||
63 | static __always_inline bool test_preempt_need_resched(void) | ||
64 | { | ||
65 | return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * The various preempt_count add/sub methods | ||
70 | */ | ||
71 | |||
72 | static __always_inline void __preempt_count_add(int val) | ||
73 | { | ||
74 | *preempt_count_ptr() += val; | ||
75 | } | ||
76 | |||
77 | static __always_inline void __preempt_count_sub(int val) | ||
78 | { | ||
79 | *preempt_count_ptr() -= val; | ||
80 | } | ||
81 | |||
82 | static __always_inline bool __preempt_count_dec_and_test(void) | ||
83 | { | ||
84 | return !--*preempt_count_ptr(); | ||
85 | } | ||
86 | |||
87 | /* | ||
88 | * Returns true when we need to resched and can (barring IRQ state). | ||
89 | */ | ||
90 | static __always_inline bool should_resched(void) | ||
91 | { | ||
92 | return unlikely(!*preempt_count_ptr()); | ||
93 | } | ||
94 | |||
95 | #ifdef CONFIG_PREEMPT | ||
96 | extern asmlinkage void preempt_schedule(void); | ||
97 | #define __preempt_schedule() preempt_schedule() | ||
98 | |||
99 | #ifdef CONFIG_CONTEXT_TRACKING | ||
100 | extern asmlinkage void preempt_schedule_context(void); | ||
101 | #define __preempt_schedule_context() preempt_schedule_context() | ||
102 | #endif | ||
103 | #endif /* CONFIG_PREEMPT */ | ||
104 | |||
105 | #endif /* __ASM_PREEMPT_H */ | ||
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 1e041063b226..d9cf963ac832 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h | |||
@@ -33,7 +33,7 @@ extern void rcu_nmi_exit(void); | |||
33 | #define __irq_enter() \ | 33 | #define __irq_enter() \ |
34 | do { \ | 34 | do { \ |
35 | account_irq_enter_time(current); \ | 35 | account_irq_enter_time(current); \ |
36 | add_preempt_count(HARDIRQ_OFFSET); \ | 36 | preempt_count_add(HARDIRQ_OFFSET); \ |
37 | trace_hardirq_enter(); \ | 37 | trace_hardirq_enter(); \ |
38 | } while (0) | 38 | } while (0) |
39 | 39 | ||
@@ -49,7 +49,7 @@ extern void irq_enter(void); | |||
49 | do { \ | 49 | do { \ |
50 | trace_hardirq_exit(); \ | 50 | trace_hardirq_exit(); \ |
51 | account_irq_exit_time(current); \ | 51 | account_irq_exit_time(current); \ |
52 | sub_preempt_count(HARDIRQ_OFFSET); \ | 52 | preempt_count_sub(HARDIRQ_OFFSET); \ |
53 | } while (0) | 53 | } while (0) |
54 | 54 | ||
55 | /* | 55 | /* |
@@ -62,7 +62,7 @@ extern void irq_exit(void); | |||
62 | lockdep_off(); \ | 62 | lockdep_off(); \ |
63 | ftrace_nmi_enter(); \ | 63 | ftrace_nmi_enter(); \ |
64 | BUG_ON(in_nmi()); \ | 64 | BUG_ON(in_nmi()); \ |
65 | add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ | 65 | preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ |
66 | rcu_nmi_enter(); \ | 66 | rcu_nmi_enter(); \ |
67 | trace_hardirq_enter(); \ | 67 | trace_hardirq_enter(); \ |
68 | } while (0) | 68 | } while (0) |
@@ -72,7 +72,7 @@ extern void irq_exit(void); | |||
72 | trace_hardirq_exit(); \ | 72 | trace_hardirq_exit(); \ |
73 | rcu_nmi_exit(); \ | 73 | rcu_nmi_exit(); \ |
74 | BUG_ON(!in_nmi()); \ | 74 | BUG_ON(!in_nmi()); \ |
75 | sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ | 75 | preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ |
76 | ftrace_nmi_exit(); \ | 76 | ftrace_nmi_exit(); \ |
77 | lockdep_on(); \ | 77 | lockdep_on(); \ |
78 | } while (0) | 78 | } while (0) |
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index da6716b9e3fe..ea4d2495c646 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h | |||
@@ -136,6 +136,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, | |||
136 | 136 | ||
137 | struct mempolicy *get_vma_policy(struct task_struct *tsk, | 137 | struct mempolicy *get_vma_policy(struct task_struct *tsk, |
138 | struct vm_area_struct *vma, unsigned long addr); | 138 | struct vm_area_struct *vma, unsigned long addr); |
139 | bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma); | ||
139 | 140 | ||
140 | extern void numa_default_policy(void); | 141 | extern void numa_default_policy(void); |
141 | extern void numa_policy_init(void); | 142 | extern void numa_policy_init(void); |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8d3c57fdf221..f5096b58b20d 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -90,11 +90,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
90 | #endif /* CONFIG_MIGRATION */ | 90 | #endif /* CONFIG_MIGRATION */ |
91 | 91 | ||
92 | #ifdef CONFIG_NUMA_BALANCING | 92 | #ifdef CONFIG_NUMA_BALANCING |
93 | extern int migrate_misplaced_page(struct page *page, int node); | 93 | extern int migrate_misplaced_page(struct page *page, |
94 | extern int migrate_misplaced_page(struct page *page, int node); | 94 | struct vm_area_struct *vma, int node); |
95 | extern bool migrate_ratelimited(int node); | 95 | extern bool migrate_ratelimited(int node); |
96 | #else | 96 | #else |
97 | static inline int migrate_misplaced_page(struct page *page, int node) | 97 | static inline int migrate_misplaced_page(struct page *page, |
98 | struct vm_area_struct *vma, int node) | ||
98 | { | 99 | { |
99 | return -EAGAIN; /* can't migrate now */ | 100 | return -EAGAIN; /* can't migrate now */ |
100 | } | 101 | } |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b6e55ee8855..81443d557a2e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
581 | * sets it, so none of the operations on it need to be atomic. | 581 | * sets it, so none of the operations on it need to be atomic. |
582 | */ | 582 | */ |
583 | 583 | ||
584 | /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ | 584 | /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ |
585 | #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) | 585 | #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) |
586 | #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) | 586 | #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) |
587 | #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) | 587 | #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) |
588 | #define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) | 588 | #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) |
589 | 589 | ||
590 | /* | 590 | /* |
591 | * Define the bit shifts to access each section. For non-existent | 591 | * Define the bit shifts to access each section. For non-existent |
@@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
595 | #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) | 595 | #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) |
596 | #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) | 596 | #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) |
597 | #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) | 597 | #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) |
598 | #define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) | 598 | #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) |
599 | 599 | ||
600 | /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ | 600 | /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ |
601 | #ifdef NODE_NOT_IN_PAGE_FLAGS | 601 | #ifdef NODE_NOT_IN_PAGE_FLAGS |
@@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
617 | #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) | 617 | #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) |
618 | #define NODES_MASK ((1UL << NODES_WIDTH) - 1) | 618 | #define NODES_MASK ((1UL << NODES_WIDTH) - 1) |
619 | #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) | 619 | #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) |
620 | #define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) | 620 | #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_WIDTH) - 1) |
621 | #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) | 621 | #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) |
622 | 622 | ||
623 | static inline enum zone_type page_zonenum(const struct page *page) | 623 | static inline enum zone_type page_zonenum(const struct page *page) |
@@ -661,51 +661,117 @@ static inline int page_to_nid(const struct page *page) | |||
661 | #endif | 661 | #endif |
662 | 662 | ||
663 | #ifdef CONFIG_NUMA_BALANCING | 663 | #ifdef CONFIG_NUMA_BALANCING |
664 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | 664 | static inline int cpu_pid_to_cpupid(int cpu, int pid) |
665 | static inline int page_nid_xchg_last(struct page *page, int nid) | ||
666 | { | 665 | { |
667 | return xchg(&page->_last_nid, nid); | 666 | return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); |
668 | } | 667 | } |
669 | 668 | ||
670 | static inline int page_nid_last(struct page *page) | 669 | static inline int cpupid_to_pid(int cpupid) |
671 | { | 670 | { |
672 | return page->_last_nid; | 671 | return cpupid & LAST__PID_MASK; |
673 | } | 672 | } |
674 | static inline void page_nid_reset_last(struct page *page) | 673 | |
674 | static inline int cpupid_to_cpu(int cpupid) | ||
675 | { | 675 | { |
676 | page->_last_nid = -1; | 676 | return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; |
677 | } | 677 | } |
678 | #else | 678 | |
679 | static inline int page_nid_last(struct page *page) | 679 | static inline int cpupid_to_nid(int cpupid) |
680 | { | 680 | { |
681 | return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; | 681 | return cpu_to_node(cpupid_to_cpu(cpupid)); |
682 | } | 682 | } |
683 | 683 | ||
684 | extern int page_nid_xchg_last(struct page *page, int nid); | 684 | static inline bool cpupid_pid_unset(int cpupid) |
685 | { | ||
686 | return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); | ||
687 | } | ||
685 | 688 | ||
686 | static inline void page_nid_reset_last(struct page *page) | 689 | static inline bool cpupid_cpu_unset(int cpupid) |
687 | { | 690 | { |
688 | int nid = (1 << LAST_NID_SHIFT) - 1; | 691 | return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); |
692 | } | ||
689 | 693 | ||
690 | page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); | 694 | static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) |
691 | page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; | 695 | { |
696 | return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); | ||
697 | } | ||
698 | |||
699 | #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) | ||
700 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS | ||
701 | static inline int page_cpupid_xchg_last(struct page *page, int cpupid) | ||
702 | { | ||
703 | return xchg(&page->_last_cpupid, cpupid); | ||
704 | } | ||
705 | |||
706 | static inline int page_cpupid_last(struct page *page) | ||
707 | { | ||
708 | return page->_last_cpupid; | ||
709 | } | ||
710 | static inline void page_cpupid_reset_last(struct page *page) | ||
711 | { | ||
712 | page->_last_cpupid = -1; | ||
692 | } | 713 | } |
693 | #endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ | ||
694 | #else | 714 | #else |
695 | static inline int page_nid_xchg_last(struct page *page, int nid) | 715 | static inline int page_cpupid_last(struct page *page) |
696 | { | 716 | { |
697 | return page_to_nid(page); | 717 | return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; |
698 | } | 718 | } |
699 | 719 | ||
700 | static inline int page_nid_last(struct page *page) | 720 | extern int page_cpupid_xchg_last(struct page *page, int cpupid); |
721 | |||
722 | static inline void page_cpupid_reset_last(struct page *page) | ||
701 | { | 723 | { |
702 | return page_to_nid(page); | 724 | int cpupid = (1 << LAST_CPUPID_SHIFT) - 1; |
725 | |||
726 | page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); | ||
727 | page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; | ||
728 | } | ||
729 | #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ | ||
730 | #else /* !CONFIG_NUMA_BALANCING */ | ||
731 | static inline int page_cpupid_xchg_last(struct page *page, int cpupid) | ||
732 | { | ||
733 | return page_to_nid(page); /* XXX */ | ||
703 | } | 734 | } |
704 | 735 | ||
705 | static inline void page_nid_reset_last(struct page *page) | 736 | static inline int page_cpupid_last(struct page *page) |
706 | { | 737 | { |
738 | return page_to_nid(page); /* XXX */ | ||
707 | } | 739 | } |
708 | #endif | 740 | |
741 | static inline int cpupid_to_nid(int cpupid) | ||
742 | { | ||
743 | return -1; | ||
744 | } | ||
745 | |||
746 | static inline int cpupid_to_pid(int cpupid) | ||
747 | { | ||
748 | return -1; | ||
749 | } | ||
750 | |||
751 | static inline int cpupid_to_cpu(int cpupid) | ||
752 | { | ||
753 | return -1; | ||
754 | } | ||
755 | |||
756 | static inline int cpu_pid_to_cpupid(int nid, int pid) | ||
757 | { | ||
758 | return -1; | ||
759 | } | ||
760 | |||
761 | static inline bool cpupid_pid_unset(int cpupid) | ||
762 | { | ||
763 | return 1; | ||
764 | } | ||
765 | |||
766 | static inline void page_cpupid_reset_last(struct page *page) | ||
767 | { | ||
768 | } | ||
769 | |||
770 | static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) | ||
771 | { | ||
772 | return false; | ||
773 | } | ||
774 | #endif /* CONFIG_NUMA_BALANCING */ | ||
709 | 775 | ||
710 | static inline struct zone *page_zone(const struct page *page) | 776 | static inline struct zone *page_zone(const struct page *page) |
711 | { | 777 | { |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d9851eeb6e1d..a3198e5aaf4e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -174,8 +174,8 @@ struct page { | |||
174 | void *shadow; | 174 | void *shadow; |
175 | #endif | 175 | #endif |
176 | 176 | ||
177 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | 177 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS |
178 | int _last_nid; | 178 | int _last_cpupid; |
179 | #endif | 179 | #endif |
180 | } | 180 | } |
181 | /* | 181 | /* |
@@ -420,28 +420,15 @@ struct mm_struct { | |||
420 | */ | 420 | */ |
421 | unsigned long numa_next_scan; | 421 | unsigned long numa_next_scan; |
422 | 422 | ||
423 | /* numa_next_reset is when the PTE scanner period will be reset */ | ||
424 | unsigned long numa_next_reset; | ||
425 | |||
426 | /* Restart point for scanning and setting pte_numa */ | 423 | /* Restart point for scanning and setting pte_numa */ |
427 | unsigned long numa_scan_offset; | 424 | unsigned long numa_scan_offset; |
428 | 425 | ||
429 | /* numa_scan_seq prevents two threads setting pte_numa */ | 426 | /* numa_scan_seq prevents two threads setting pte_numa */ |
430 | int numa_scan_seq; | 427 | int numa_scan_seq; |
431 | |||
432 | /* | ||
433 | * The first node a task was scheduled on. If a task runs on | ||
434 | * a different node than Make PTE Scan Go Now. | ||
435 | */ | ||
436 | int first_nid; | ||
437 | #endif | 428 | #endif |
438 | struct uprobes_state uprobes_state; | 429 | struct uprobes_state uprobes_state; |
439 | }; | 430 | }; |
440 | 431 | ||
441 | /* first nid will either be a valid NID or one of these values */ | ||
442 | #define NUMA_PTE_SCAN_INIT -1 | ||
443 | #define NUMA_PTE_SCAN_ACTIVE -2 | ||
444 | |||
445 | static inline void mm_init_cpumask(struct mm_struct *mm) | 432 | static inline void mm_init_cpumask(struct mm_struct *mm) |
446 | { | 433 | { |
447 | #ifdef CONFIG_CPUMASK_OFFSTACK | 434 | #ifdef CONFIG_CPUMASK_OFFSTACK |
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 93506a114034..da523661500a 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h | |||
@@ -38,10 +38,10 @@ | |||
38 | * The last is when there is insufficient space in page->flags and a separate | 38 | * The last is when there is insufficient space in page->flags and a separate |
39 | * lookup is necessary. | 39 | * lookup is necessary. |
40 | * | 40 | * |
41 | * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | | 41 | * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | |
42 | * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS | | 42 | * " plus space for last_cpupid: | NODE | ZONE | LAST_CPUPID ... | FLAGS | |
43 | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | | 43 | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | |
44 | * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS | | 44 | * " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS | |
45 | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | | 45 | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | |
46 | */ | 46 | */ |
47 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | 47 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) |
@@ -62,15 +62,21 @@ | |||
62 | #endif | 62 | #endif |
63 | 63 | ||
64 | #ifdef CONFIG_NUMA_BALANCING | 64 | #ifdef CONFIG_NUMA_BALANCING |
65 | #define LAST_NID_SHIFT NODES_SHIFT | 65 | #define LAST__PID_SHIFT 8 |
66 | #define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1) | ||
67 | |||
68 | #define LAST__CPU_SHIFT NR_CPUS_BITS | ||
69 | #define LAST__CPU_MASK ((1 << LAST__CPU_SHIFT)-1) | ||
70 | |||
71 | #define LAST_CPUPID_SHIFT (LAST__PID_SHIFT+LAST__CPU_SHIFT) | ||
66 | #else | 72 | #else |
67 | #define LAST_NID_SHIFT 0 | 73 | #define LAST_CPUPID_SHIFT 0 |
68 | #endif | 74 | #endif |
69 | 75 | ||
70 | #if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS | 76 | #if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS |
71 | #define LAST_NID_WIDTH LAST_NID_SHIFT | 77 | #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT |
72 | #else | 78 | #else |
73 | #define LAST_NID_WIDTH 0 | 79 | #define LAST_CPUPID_WIDTH 0 |
74 | #endif | 80 | #endif |
75 | 81 | ||
76 | /* | 82 | /* |
@@ -81,8 +87,8 @@ | |||
81 | #define NODE_NOT_IN_PAGE_FLAGS | 87 | #define NODE_NOT_IN_PAGE_FLAGS |
82 | #endif | 88 | #endif |
83 | 89 | ||
84 | #if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0 | 90 | #if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0 |
85 | #define LAST_NID_NOT_IN_PAGE_FLAGS | 91 | #define LAST_CPUPID_NOT_IN_PAGE_FLAGS |
86 | #endif | 92 | #endif |
87 | 93 | ||
88 | #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ | 94 | #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ |
diff --git a/include/linux/preempt.h b/include/linux/preempt.h index f5d4723cdb3d..a3d9dc8c2c00 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h | |||
@@ -6,106 +6,95 @@ | |||
6 | * preempt_count (used for kernel preemption, interrupt count, etc.) | 6 | * preempt_count (used for kernel preemption, interrupt count, etc.) |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/thread_info.h> | ||
10 | #include <linux/linkage.h> | 9 | #include <linux/linkage.h> |
11 | #include <linux/list.h> | 10 | #include <linux/list.h> |
12 | 11 | ||
13 | #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) | 12 | /* |
14 | extern void add_preempt_count(int val); | 13 | * We use the MSB mostly because its available; see <linux/preempt_mask.h> for |
15 | extern void sub_preempt_count(int val); | 14 | * the other bits -- can't include that header due to inclusion hell. |
16 | #else | 15 | */ |
17 | # define add_preempt_count(val) do { preempt_count() += (val); } while (0) | 16 | #define PREEMPT_NEED_RESCHED 0x80000000 |
18 | # define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) | ||
19 | #endif | ||
20 | |||
21 | #define inc_preempt_count() add_preempt_count(1) | ||
22 | #define dec_preempt_count() sub_preempt_count(1) | ||
23 | |||
24 | #define preempt_count() (current_thread_info()->preempt_count) | ||
25 | |||
26 | #ifdef CONFIG_PREEMPT | ||
27 | |||
28 | asmlinkage void preempt_schedule(void); | ||
29 | |||
30 | #define preempt_check_resched() \ | ||
31 | do { \ | ||
32 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ | ||
33 | preempt_schedule(); \ | ||
34 | } while (0) | ||
35 | |||
36 | #ifdef CONFIG_CONTEXT_TRACKING | ||
37 | 17 | ||
38 | void preempt_schedule_context(void); | 18 | #include <asm/preempt.h> |
39 | 19 | ||
40 | #define preempt_check_resched_context() \ | 20 | #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) |
41 | do { \ | 21 | extern void preempt_count_add(int val); |
42 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ | 22 | extern void preempt_count_sub(int val); |
43 | preempt_schedule_context(); \ | 23 | #define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); }) |
44 | } while (0) | ||
45 | #else | 24 | #else |
25 | #define preempt_count_add(val) __preempt_count_add(val) | ||
26 | #define preempt_count_sub(val) __preempt_count_sub(val) | ||
27 | #define preempt_count_dec_and_test() __preempt_count_dec_and_test() | ||
28 | #endif | ||
46 | 29 | ||
47 | #define preempt_check_resched_context() preempt_check_resched() | 30 | #define __preempt_count_inc() __preempt_count_add(1) |
48 | 31 | #define __preempt_count_dec() __preempt_count_sub(1) | |
49 | #endif /* CONFIG_CONTEXT_TRACKING */ | ||
50 | |||
51 | #else /* !CONFIG_PREEMPT */ | ||
52 | |||
53 | #define preempt_check_resched() do { } while (0) | ||
54 | #define preempt_check_resched_context() do { } while (0) | ||
55 | |||
56 | #endif /* CONFIG_PREEMPT */ | ||
57 | 32 | ||
33 | #define preempt_count_inc() preempt_count_add(1) | ||
34 | #define preempt_count_dec() preempt_count_sub(1) | ||
58 | 35 | ||
59 | #ifdef CONFIG_PREEMPT_COUNT | 36 | #ifdef CONFIG_PREEMPT_COUNT |
60 | 37 | ||
61 | #define preempt_disable() \ | 38 | #define preempt_disable() \ |
62 | do { \ | 39 | do { \ |
63 | inc_preempt_count(); \ | 40 | preempt_count_inc(); \ |
64 | barrier(); \ | 41 | barrier(); \ |
65 | } while (0) | 42 | } while (0) |
66 | 43 | ||
67 | #define sched_preempt_enable_no_resched() \ | 44 | #define sched_preempt_enable_no_resched() \ |
68 | do { \ | 45 | do { \ |
69 | barrier(); \ | 46 | barrier(); \ |
70 | dec_preempt_count(); \ | 47 | preempt_count_dec(); \ |
71 | } while (0) | 48 | } while (0) |
72 | 49 | ||
73 | #define preempt_enable_no_resched() sched_preempt_enable_no_resched() | 50 | #define preempt_enable_no_resched() sched_preempt_enable_no_resched() |
74 | 51 | ||
52 | #ifdef CONFIG_PREEMPT | ||
75 | #define preempt_enable() \ | 53 | #define preempt_enable() \ |
76 | do { \ | 54 | do { \ |
77 | preempt_enable_no_resched(); \ | ||
78 | barrier(); \ | 55 | barrier(); \ |
79 | preempt_check_resched(); \ | 56 | if (unlikely(preempt_count_dec_and_test())) \ |
57 | __preempt_schedule(); \ | ||
58 | } while (0) | ||
59 | |||
60 | #define preempt_check_resched() \ | ||
61 | do { \ | ||
62 | if (should_resched()) \ | ||
63 | __preempt_schedule(); \ | ||
80 | } while (0) | 64 | } while (0) |
81 | 65 | ||
82 | /* For debugging and tracer internals only! */ | 66 | #else |
83 | #define add_preempt_count_notrace(val) \ | 67 | #define preempt_enable() preempt_enable_no_resched() |
84 | do { preempt_count() += (val); } while (0) | 68 | #define preempt_check_resched() do { } while (0) |
85 | #define sub_preempt_count_notrace(val) \ | 69 | #endif |
86 | do { preempt_count() -= (val); } while (0) | ||
87 | #define inc_preempt_count_notrace() add_preempt_count_notrace(1) | ||
88 | #define dec_preempt_count_notrace() sub_preempt_count_notrace(1) | ||
89 | 70 | ||
90 | #define preempt_disable_notrace() \ | 71 | #define preempt_disable_notrace() \ |
91 | do { \ | 72 | do { \ |
92 | inc_preempt_count_notrace(); \ | 73 | __preempt_count_inc(); \ |
93 | barrier(); \ | 74 | barrier(); \ |
94 | } while (0) | 75 | } while (0) |
95 | 76 | ||
96 | #define preempt_enable_no_resched_notrace() \ | 77 | #define preempt_enable_no_resched_notrace() \ |
97 | do { \ | 78 | do { \ |
98 | barrier(); \ | 79 | barrier(); \ |
99 | dec_preempt_count_notrace(); \ | 80 | __preempt_count_dec(); \ |
100 | } while (0) | 81 | } while (0) |
101 | 82 | ||
102 | /* preempt_check_resched is OK to trace */ | 83 | #ifdef CONFIG_PREEMPT |
84 | |||
85 | #ifndef CONFIG_CONTEXT_TRACKING | ||
86 | #define __preempt_schedule_context() __preempt_schedule() | ||
87 | #endif | ||
88 | |||
103 | #define preempt_enable_notrace() \ | 89 | #define preempt_enable_notrace() \ |
104 | do { \ | 90 | do { \ |
105 | preempt_enable_no_resched_notrace(); \ | ||
106 | barrier(); \ | 91 | barrier(); \ |
107 | preempt_check_resched_context(); \ | 92 | if (unlikely(__preempt_count_dec_and_test())) \ |
93 | __preempt_schedule_context(); \ | ||
108 | } while (0) | 94 | } while (0) |
95 | #else | ||
96 | #define preempt_enable_notrace() preempt_enable_no_resched_notrace() | ||
97 | #endif | ||
109 | 98 | ||
110 | #else /* !CONFIG_PREEMPT_COUNT */ | 99 | #else /* !CONFIG_PREEMPT_COUNT */ |
111 | 100 | ||
@@ -115,10 +104,11 @@ do { \ | |||
115 | * that can cause faults and scheduling migrate into our preempt-protected | 104 | * that can cause faults and scheduling migrate into our preempt-protected |
116 | * region. | 105 | * region. |
117 | */ | 106 | */ |
118 | #define preempt_disable() barrier() | 107 | #define preempt_disable() barrier() |
119 | #define sched_preempt_enable_no_resched() barrier() | 108 | #define sched_preempt_enable_no_resched() barrier() |
120 | #define preempt_enable_no_resched() barrier() | 109 | #define preempt_enable_no_resched() barrier() |
121 | #define preempt_enable() barrier() | 110 | #define preempt_enable() barrier() |
111 | #define preempt_check_resched() do { } while (0) | ||
122 | 112 | ||
123 | #define preempt_disable_notrace() barrier() | 113 | #define preempt_disable_notrace() barrier() |
124 | #define preempt_enable_no_resched_notrace() barrier() | 114 | #define preempt_enable_no_resched_notrace() barrier() |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6682da36b293..833eed55cf43 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -22,6 +22,7 @@ struct sched_param { | |||
22 | #include <linux/errno.h> | 22 | #include <linux/errno.h> |
23 | #include <linux/nodemask.h> | 23 | #include <linux/nodemask.h> |
24 | #include <linux/mm_types.h> | 24 | #include <linux/mm_types.h> |
25 | #include <linux/preempt.h> | ||
25 | 26 | ||
26 | #include <asm/page.h> | 27 | #include <asm/page.h> |
27 | #include <asm/ptrace.h> | 28 | #include <asm/ptrace.h> |
@@ -427,6 +428,14 @@ struct task_cputime { | |||
427 | .sum_exec_runtime = 0, \ | 428 | .sum_exec_runtime = 0, \ |
428 | } | 429 | } |
429 | 430 | ||
431 | #define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) | ||
432 | |||
433 | #ifdef CONFIG_PREEMPT_COUNT | ||
434 | #define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) | ||
435 | #else | ||
436 | #define PREEMPT_DISABLED PREEMPT_ENABLED | ||
437 | #endif | ||
438 | |||
430 | /* | 439 | /* |
431 | * Disable preemption until the scheduler is running. | 440 | * Disable preemption until the scheduler is running. |
432 | * Reset by start_kernel()->sched_init()->init_idle(). | 441 | * Reset by start_kernel()->sched_init()->init_idle(). |
@@ -434,7 +443,7 @@ struct task_cputime { | |||
434 | * We include PREEMPT_ACTIVE to avoid cond_resched() from working | 443 | * We include PREEMPT_ACTIVE to avoid cond_resched() from working |
435 | * before the scheduler is active -- see should_resched(). | 444 | * before the scheduler is active -- see should_resched(). |
436 | */ | 445 | */ |
437 | #define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) | 446 | #define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE) |
438 | 447 | ||
439 | /** | 448 | /** |
440 | * struct thread_group_cputimer - thread group interval timer counts | 449 | * struct thread_group_cputimer - thread group interval timer counts |
@@ -768,6 +777,7 @@ enum cpu_idle_type { | |||
768 | #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ | 777 | #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ |
769 | #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ | 778 | #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ |
770 | #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ | 779 | #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ |
780 | #define SD_NUMA 0x4000 /* cross-node balancing */ | ||
771 | 781 | ||
772 | extern int __weak arch_sd_sibiling_asym_packing(void); | 782 | extern int __weak arch_sd_sibiling_asym_packing(void); |
773 | 783 | ||
@@ -811,6 +821,10 @@ struct sched_domain { | |||
811 | 821 | ||
812 | u64 last_update; | 822 | u64 last_update; |
813 | 823 | ||
824 | /* idle_balance() stats */ | ||
825 | u64 max_newidle_lb_cost; | ||
826 | unsigned long next_decay_max_lb_cost; | ||
827 | |||
814 | #ifdef CONFIG_SCHEDSTATS | 828 | #ifdef CONFIG_SCHEDSTATS |
815 | /* load_balance() stats */ | 829 | /* load_balance() stats */ |
816 | unsigned int lb_count[CPU_MAX_IDLE_TYPES]; | 830 | unsigned int lb_count[CPU_MAX_IDLE_TYPES]; |
@@ -1029,6 +1043,8 @@ struct task_struct { | |||
1029 | struct task_struct *last_wakee; | 1043 | struct task_struct *last_wakee; |
1030 | unsigned long wakee_flips; | 1044 | unsigned long wakee_flips; |
1031 | unsigned long wakee_flip_decay_ts; | 1045 | unsigned long wakee_flip_decay_ts; |
1046 | |||
1047 | int wake_cpu; | ||
1032 | #endif | 1048 | #endif |
1033 | int on_rq; | 1049 | int on_rq; |
1034 | 1050 | ||
@@ -1324,10 +1340,41 @@ struct task_struct { | |||
1324 | #endif | 1340 | #endif |
1325 | #ifdef CONFIG_NUMA_BALANCING | 1341 | #ifdef CONFIG_NUMA_BALANCING |
1326 | int numa_scan_seq; | 1342 | int numa_scan_seq; |
1327 | int numa_migrate_seq; | ||
1328 | unsigned int numa_scan_period; | 1343 | unsigned int numa_scan_period; |
1344 | unsigned int numa_scan_period_max; | ||
1345 | int numa_preferred_nid; | ||
1346 | int numa_migrate_deferred; | ||
1347 | unsigned long numa_migrate_retry; | ||
1329 | u64 node_stamp; /* migration stamp */ | 1348 | u64 node_stamp; /* migration stamp */ |
1330 | struct callback_head numa_work; | 1349 | struct callback_head numa_work; |
1350 | |||
1351 | struct list_head numa_entry; | ||
1352 | struct numa_group *numa_group; | ||
1353 | |||
1354 | /* | ||
1355 | * Exponential decaying average of faults on a per-node basis. | ||
1356 | * Scheduling placement decisions are made based on the these counts. | ||
1357 | * The values remain static for the duration of a PTE scan | ||
1358 | */ | ||
1359 | unsigned long *numa_faults; | ||
1360 | unsigned long total_numa_faults; | ||
1361 | |||
1362 | /* | ||
1363 | * numa_faults_buffer records faults per node during the current | ||
1364 | * scan window. When the scan completes, the counts in numa_faults | ||
1365 | * decay and these values are copied. | ||
1366 | */ | ||
1367 | unsigned long *numa_faults_buffer; | ||
1368 | |||
1369 | /* | ||
1370 | * numa_faults_locality tracks if faults recorded during the last | ||
1371 | * scan window were remote/local. The task scan period is adapted | ||
1372 | * based on the locality of the faults with different weights | ||
1373 | * depending on whether they were shared or private faults | ||
1374 | */ | ||
1375 | unsigned long numa_faults_locality[2]; | ||
1376 | |||
1377 | unsigned long numa_pages_migrated; | ||
1331 | #endif /* CONFIG_NUMA_BALANCING */ | 1378 | #endif /* CONFIG_NUMA_BALANCING */ |
1332 | 1379 | ||
1333 | struct rcu_head rcu; | 1380 | struct rcu_head rcu; |
@@ -1413,16 +1460,33 @@ struct task_struct { | |||
1413 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ | 1460 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ |
1414 | #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) | 1461 | #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) |
1415 | 1462 | ||
1463 | #define TNF_MIGRATED 0x01 | ||
1464 | #define TNF_NO_GROUP 0x02 | ||
1465 | #define TNF_SHARED 0x04 | ||
1466 | #define TNF_FAULT_LOCAL 0x08 | ||
1467 | |||
1416 | #ifdef CONFIG_NUMA_BALANCING | 1468 | #ifdef CONFIG_NUMA_BALANCING |
1417 | extern void task_numa_fault(int node, int pages, bool migrated); | 1469 | extern void task_numa_fault(int last_node, int node, int pages, int flags); |
1470 | extern pid_t task_numa_group_id(struct task_struct *p); | ||
1418 | extern void set_numabalancing_state(bool enabled); | 1471 | extern void set_numabalancing_state(bool enabled); |
1472 | extern void task_numa_free(struct task_struct *p); | ||
1473 | |||
1474 | extern unsigned int sysctl_numa_balancing_migrate_deferred; | ||
1419 | #else | 1475 | #else |
1420 | static inline void task_numa_fault(int node, int pages, bool migrated) | 1476 | static inline void task_numa_fault(int last_node, int node, int pages, |
1477 | int flags) | ||
1421 | { | 1478 | { |
1422 | } | 1479 | } |
1480 | static inline pid_t task_numa_group_id(struct task_struct *p) | ||
1481 | { | ||
1482 | return 0; | ||
1483 | } | ||
1423 | static inline void set_numabalancing_state(bool enabled) | 1484 | static inline void set_numabalancing_state(bool enabled) |
1424 | { | 1485 | { |
1425 | } | 1486 | } |
1487 | static inline void task_numa_free(struct task_struct *p) | ||
1488 | { | ||
1489 | } | ||
1426 | #endif | 1490 | #endif |
1427 | 1491 | ||
1428 | static inline struct pid *task_pid(struct task_struct *task) | 1492 | static inline struct pid *task_pid(struct task_struct *task) |
@@ -1975,7 +2039,7 @@ extern void wake_up_new_task(struct task_struct *tsk); | |||
1975 | #else | 2039 | #else |
1976 | static inline void kick_process(struct task_struct *tsk) { } | 2040 | static inline void kick_process(struct task_struct *tsk) { } |
1977 | #endif | 2041 | #endif |
1978 | extern void sched_fork(struct task_struct *p); | 2042 | extern void sched_fork(unsigned long clone_flags, struct task_struct *p); |
1979 | extern void sched_dead(struct task_struct *p); | 2043 | extern void sched_dead(struct task_struct *p); |
1980 | 2044 | ||
1981 | extern void proc_caches_init(void); | 2045 | extern void proc_caches_init(void); |
@@ -2402,11 +2466,6 @@ static inline int signal_pending_state(long state, struct task_struct *p) | |||
2402 | return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); | 2466 | return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); |
2403 | } | 2467 | } |
2404 | 2468 | ||
2405 | static inline int need_resched(void) | ||
2406 | { | ||
2407 | return unlikely(test_thread_flag(TIF_NEED_RESCHED)); | ||
2408 | } | ||
2409 | |||
2410 | /* | 2469 | /* |
2411 | * cond_resched() and cond_resched_lock(): latency reduction via | 2470 | * cond_resched() and cond_resched_lock(): latency reduction via |
2412 | * explicit rescheduling in places that are safe. The return | 2471 | * explicit rescheduling in places that are safe. The return |
@@ -2475,36 +2534,105 @@ static inline int tsk_is_polling(struct task_struct *p) | |||
2475 | { | 2534 | { |
2476 | return task_thread_info(p)->status & TS_POLLING; | 2535 | return task_thread_info(p)->status & TS_POLLING; |
2477 | } | 2536 | } |
2478 | static inline void current_set_polling(void) | 2537 | static inline void __current_set_polling(void) |
2479 | { | 2538 | { |
2480 | current_thread_info()->status |= TS_POLLING; | 2539 | current_thread_info()->status |= TS_POLLING; |
2481 | } | 2540 | } |
2482 | 2541 | ||
2483 | static inline void current_clr_polling(void) | 2542 | static inline bool __must_check current_set_polling_and_test(void) |
2543 | { | ||
2544 | __current_set_polling(); | ||
2545 | |||
2546 | /* | ||
2547 | * Polling state must be visible before we test NEED_RESCHED, | ||
2548 | * paired by resched_task() | ||
2549 | */ | ||
2550 | smp_mb(); | ||
2551 | |||
2552 | return unlikely(tif_need_resched()); | ||
2553 | } | ||
2554 | |||
2555 | static inline void __current_clr_polling(void) | ||
2484 | { | 2556 | { |
2485 | current_thread_info()->status &= ~TS_POLLING; | 2557 | current_thread_info()->status &= ~TS_POLLING; |
2486 | smp_mb__after_clear_bit(); | 2558 | } |
2559 | |||
2560 | static inline bool __must_check current_clr_polling_and_test(void) | ||
2561 | { | ||
2562 | __current_clr_polling(); | ||
2563 | |||
2564 | /* | ||
2565 | * Polling state must be visible before we test NEED_RESCHED, | ||
2566 | * paired by resched_task() | ||
2567 | */ | ||
2568 | smp_mb(); | ||
2569 | |||
2570 | return unlikely(tif_need_resched()); | ||
2487 | } | 2571 | } |
2488 | #elif defined(TIF_POLLING_NRFLAG) | 2572 | #elif defined(TIF_POLLING_NRFLAG) |
2489 | static inline int tsk_is_polling(struct task_struct *p) | 2573 | static inline int tsk_is_polling(struct task_struct *p) |
2490 | { | 2574 | { |
2491 | return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); | 2575 | return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); |
2492 | } | 2576 | } |
2493 | static inline void current_set_polling(void) | 2577 | |
2578 | static inline void __current_set_polling(void) | ||
2494 | { | 2579 | { |
2495 | set_thread_flag(TIF_POLLING_NRFLAG); | 2580 | set_thread_flag(TIF_POLLING_NRFLAG); |
2496 | } | 2581 | } |
2497 | 2582 | ||
2498 | static inline void current_clr_polling(void) | 2583 | static inline bool __must_check current_set_polling_and_test(void) |
2584 | { | ||
2585 | __current_set_polling(); | ||
2586 | |||
2587 | /* | ||
2588 | * Polling state must be visible before we test NEED_RESCHED, | ||
2589 | * paired by resched_task() | ||
2590 | * | ||
2591 | * XXX: assumes set/clear bit are identical barrier wise. | ||
2592 | */ | ||
2593 | smp_mb__after_clear_bit(); | ||
2594 | |||
2595 | return unlikely(tif_need_resched()); | ||
2596 | } | ||
2597 | |||
2598 | static inline void __current_clr_polling(void) | ||
2499 | { | 2599 | { |
2500 | clear_thread_flag(TIF_POLLING_NRFLAG); | 2600 | clear_thread_flag(TIF_POLLING_NRFLAG); |
2501 | } | 2601 | } |
2602 | |||
2603 | static inline bool __must_check current_clr_polling_and_test(void) | ||
2604 | { | ||
2605 | __current_clr_polling(); | ||
2606 | |||
2607 | /* | ||
2608 | * Polling state must be visible before we test NEED_RESCHED, | ||
2609 | * paired by resched_task() | ||
2610 | */ | ||
2611 | smp_mb__after_clear_bit(); | ||
2612 | |||
2613 | return unlikely(tif_need_resched()); | ||
2614 | } | ||
2615 | |||
2502 | #else | 2616 | #else |
2503 | static inline int tsk_is_polling(struct task_struct *p) { return 0; } | 2617 | static inline int tsk_is_polling(struct task_struct *p) { return 0; } |
2504 | static inline void current_set_polling(void) { } | 2618 | static inline void __current_set_polling(void) { } |
2505 | static inline void current_clr_polling(void) { } | 2619 | static inline void __current_clr_polling(void) { } |
2620 | |||
2621 | static inline bool __must_check current_set_polling_and_test(void) | ||
2622 | { | ||
2623 | return unlikely(tif_need_resched()); | ||
2624 | } | ||
2625 | static inline bool __must_check current_clr_polling_and_test(void) | ||
2626 | { | ||
2627 | return unlikely(tif_need_resched()); | ||
2628 | } | ||
2506 | #endif | 2629 | #endif |
2507 | 2630 | ||
2631 | static __always_inline bool need_resched(void) | ||
2632 | { | ||
2633 | return unlikely(tif_need_resched()); | ||
2634 | } | ||
2635 | |||
2508 | /* | 2636 | /* |
2509 | * Thread group CPU time accounting. | 2637 | * Thread group CPU time accounting. |
2510 | */ | 2638 | */ |
@@ -2546,6 +2674,11 @@ static inline unsigned int task_cpu(const struct task_struct *p) | |||
2546 | return task_thread_info(p)->cpu; | 2674 | return task_thread_info(p)->cpu; |
2547 | } | 2675 | } |
2548 | 2676 | ||
2677 | static inline int task_node(const struct task_struct *p) | ||
2678 | { | ||
2679 | return cpu_to_node(task_cpu(p)); | ||
2680 | } | ||
2681 | |||
2549 | extern void set_task_cpu(struct task_struct *p, unsigned int cpu); | 2682 | extern void set_task_cpu(struct task_struct *p, unsigned int cpu); |
2550 | 2683 | ||
2551 | #else | 2684 | #else |
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index bf8086b2506e..10d16c4fbe89 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h | |||
@@ -47,7 +47,6 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; | |||
47 | extern unsigned int sysctl_numa_balancing_scan_delay; | 47 | extern unsigned int sysctl_numa_balancing_scan_delay; |
48 | extern unsigned int sysctl_numa_balancing_scan_period_min; | 48 | extern unsigned int sysctl_numa_balancing_scan_period_min; |
49 | extern unsigned int sysctl_numa_balancing_scan_period_max; | 49 | extern unsigned int sysctl_numa_balancing_scan_period_max; |
50 | extern unsigned int sysctl_numa_balancing_scan_period_reset; | ||
51 | extern unsigned int sysctl_numa_balancing_scan_size; | 50 | extern unsigned int sysctl_numa_balancing_scan_size; |
52 | extern unsigned int sysctl_numa_balancing_settle_count; | 51 | extern unsigned int sysctl_numa_balancing_settle_count; |
53 | 52 | ||
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 3b5e910d14ca..d2abbdb8c6aa 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h | |||
@@ -28,6 +28,7 @@ struct cpu_stop_work { | |||
28 | }; | 28 | }; |
29 | 29 | ||
30 | int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); | 30 | int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); |
31 | int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); | ||
31 | void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | 32 | void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, |
32 | struct cpu_stop_work *work_buf); | 33 | struct cpu_stop_work *work_buf); |
33 | int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); | 34 | int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); |
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e7e04736802f..fddbe2023a5d 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h | |||
@@ -104,8 +104,21 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) | |||
104 | #define test_thread_flag(flag) \ | 104 | #define test_thread_flag(flag) \ |
105 | test_ti_thread_flag(current_thread_info(), flag) | 105 | test_ti_thread_flag(current_thread_info(), flag) |
106 | 106 | ||
107 | #define set_need_resched() set_thread_flag(TIF_NEED_RESCHED) | 107 | static inline __deprecated void set_need_resched(void) |
108 | #define clear_need_resched() clear_thread_flag(TIF_NEED_RESCHED) | 108 | { |
109 | /* | ||
110 | * Use of this function in deprecated. | ||
111 | * | ||
112 | * As of this writing there are only a few users in the DRM tree left | ||
113 | * all of which are wrong and can be removed without causing too much | ||
114 | * grief. | ||
115 | * | ||
116 | * The DRM people are aware and are working on removing the last few | ||
117 | * instances. | ||
118 | */ | ||
119 | } | ||
120 | |||
121 | #define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) | ||
109 | 122 | ||
110 | #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK | 123 | #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK |
111 | /* | 124 | /* |
diff --git a/include/linux/topology.h b/include/linux/topology.h index d3cf0d6e7712..12ae6ce997d6 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h | |||
@@ -106,6 +106,8 @@ int arch_update_cpu_topology(void); | |||
106 | .last_balance = jiffies, \ | 106 | .last_balance = jiffies, \ |
107 | .balance_interval = 1, \ | 107 | .balance_interval = 1, \ |
108 | .smt_gain = 1178, /* 15% */ \ | 108 | .smt_gain = 1178, /* 15% */ \ |
109 | .max_newidle_lb_cost = 0, \ | ||
110 | .next_decay_max_lb_cost = jiffies, \ | ||
109 | } | 111 | } |
110 | #endif | 112 | #endif |
111 | #endif /* CONFIG_SCHED_SMT */ | 113 | #endif /* CONFIG_SCHED_SMT */ |
@@ -135,6 +137,8 @@ int arch_update_cpu_topology(void); | |||
135 | , \ | 137 | , \ |
136 | .last_balance = jiffies, \ | 138 | .last_balance = jiffies, \ |
137 | .balance_interval = 1, \ | 139 | .balance_interval = 1, \ |
140 | .max_newidle_lb_cost = 0, \ | ||
141 | .next_decay_max_lb_cost = jiffies, \ | ||
138 | } | 142 | } |
139 | #endif | 143 | #endif |
140 | #endif /* CONFIG_SCHED_MC */ | 144 | #endif /* CONFIG_SCHED_MC */ |
@@ -166,6 +170,8 @@ int arch_update_cpu_topology(void); | |||
166 | , \ | 170 | , \ |
167 | .last_balance = jiffies, \ | 171 | .last_balance = jiffies, \ |
168 | .balance_interval = 1, \ | 172 | .balance_interval = 1, \ |
173 | .max_newidle_lb_cost = 0, \ | ||
174 | .next_decay_max_lb_cost = jiffies, \ | ||
169 | } | 175 | } |
170 | #endif | 176 | #endif |
171 | 177 | ||
diff --git a/include/linux/tty.h b/include/linux/tty.h index 64f864651d86..633cac77f9f9 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h | |||
@@ -672,31 +672,17 @@ static inline void tty_wait_until_sent_from_close(struct tty_struct *tty, | |||
672 | #define wait_event_interruptible_tty(tty, wq, condition) \ | 672 | #define wait_event_interruptible_tty(tty, wq, condition) \ |
673 | ({ \ | 673 | ({ \ |
674 | int __ret = 0; \ | 674 | int __ret = 0; \ |
675 | if (!(condition)) { \ | 675 | if (!(condition)) \ |
676 | __wait_event_interruptible_tty(tty, wq, condition, __ret); \ | 676 | __ret = __wait_event_interruptible_tty(tty, wq, \ |
677 | } \ | 677 | condition); \ |
678 | __ret; \ | 678 | __ret; \ |
679 | }) | 679 | }) |
680 | 680 | ||
681 | #define __wait_event_interruptible_tty(tty, wq, condition, ret) \ | 681 | #define __wait_event_interruptible_tty(tty, wq, condition) \ |
682 | do { \ | 682 | ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ |
683 | DEFINE_WAIT(__wait); \ | 683 | tty_unlock(tty); \ |
684 | \ | ||
685 | for (;;) { \ | ||
686 | prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ | ||
687 | if (condition) \ | ||
688 | break; \ | ||
689 | if (!signal_pending(current)) { \ | ||
690 | tty_unlock(tty); \ | ||
691 | schedule(); \ | 684 | schedule(); \ |
692 | tty_lock(tty); \ | 685 | tty_lock(tty)) |
693 | continue; \ | ||
694 | } \ | ||
695 | ret = -ERESTARTSYS; \ | ||
696 | break; \ | ||
697 | } \ | ||
698 | finish_wait(&wq, &__wait); \ | ||
699 | } while (0) | ||
700 | 686 | ||
701 | #ifdef CONFIG_PROC_FS | 687 | #ifdef CONFIG_PROC_FS |
702 | extern void proc_tty_register_driver(struct tty_driver *); | 688 | extern void proc_tty_register_driver(struct tty_driver *); |
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5ca0951e1855..9d8cf056e661 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h | |||
@@ -15,7 +15,7 @@ | |||
15 | */ | 15 | */ |
16 | static inline void pagefault_disable(void) | 16 | static inline void pagefault_disable(void) |
17 | { | 17 | { |
18 | inc_preempt_count(); | 18 | preempt_count_inc(); |
19 | /* | 19 | /* |
20 | * make sure to have issued the store before a pagefault | 20 | * make sure to have issued the store before a pagefault |
21 | * can hit. | 21 | * can hit. |
@@ -30,11 +30,7 @@ static inline void pagefault_enable(void) | |||
30 | * the pagefault handler again. | 30 | * the pagefault handler again. |
31 | */ | 31 | */ |
32 | barrier(); | 32 | barrier(); |
33 | dec_preempt_count(); | 33 | preempt_count_dec(); |
34 | /* | ||
35 | * make sure we do.. | ||
36 | */ | ||
37 | barrier(); | ||
38 | preempt_check_resched(); | 34 | preempt_check_resched(); |
39 | } | 35 | } |
40 | 36 | ||
diff --git a/include/linux/wait.h b/include/linux/wait.h index a67fc1635592..a2726c7dd244 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h | |||
@@ -1,7 +1,8 @@ | |||
1 | #ifndef _LINUX_WAIT_H | 1 | #ifndef _LINUX_WAIT_H |
2 | #define _LINUX_WAIT_H | 2 | #define _LINUX_WAIT_H |
3 | 3 | /* | |
4 | 4 | * Linux wait queue related types and methods | |
5 | */ | ||
5 | #include <linux/list.h> | 6 | #include <linux/list.h> |
6 | #include <linux/stddef.h> | 7 | #include <linux/stddef.h> |
7 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
@@ -13,27 +14,27 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v | |||
13 | int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); | 14 | int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); |
14 | 15 | ||
15 | struct __wait_queue { | 16 | struct __wait_queue { |
16 | unsigned int flags; | 17 | unsigned int flags; |
17 | #define WQ_FLAG_EXCLUSIVE 0x01 | 18 | #define WQ_FLAG_EXCLUSIVE 0x01 |
18 | void *private; | 19 | void *private; |
19 | wait_queue_func_t func; | 20 | wait_queue_func_t func; |
20 | struct list_head task_list; | 21 | struct list_head task_list; |
21 | }; | 22 | }; |
22 | 23 | ||
23 | struct wait_bit_key { | 24 | struct wait_bit_key { |
24 | void *flags; | 25 | void *flags; |
25 | int bit_nr; | 26 | int bit_nr; |
26 | #define WAIT_ATOMIC_T_BIT_NR -1 | 27 | #define WAIT_ATOMIC_T_BIT_NR -1 |
27 | }; | 28 | }; |
28 | 29 | ||
29 | struct wait_bit_queue { | 30 | struct wait_bit_queue { |
30 | struct wait_bit_key key; | 31 | struct wait_bit_key key; |
31 | wait_queue_t wait; | 32 | wait_queue_t wait; |
32 | }; | 33 | }; |
33 | 34 | ||
34 | struct __wait_queue_head { | 35 | struct __wait_queue_head { |
35 | spinlock_t lock; | 36 | spinlock_t lock; |
36 | struct list_head task_list; | 37 | struct list_head task_list; |
37 | }; | 38 | }; |
38 | typedef struct __wait_queue_head wait_queue_head_t; | 39 | typedef struct __wait_queue_head wait_queue_head_t; |
39 | 40 | ||
@@ -84,17 +85,17 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct | |||
84 | 85 | ||
85 | static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) | 86 | static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) |
86 | { | 87 | { |
87 | q->flags = 0; | 88 | q->flags = 0; |
88 | q->private = p; | 89 | q->private = p; |
89 | q->func = default_wake_function; | 90 | q->func = default_wake_function; |
90 | } | 91 | } |
91 | 92 | ||
92 | static inline void init_waitqueue_func_entry(wait_queue_t *q, | 93 | static inline void |
93 | wait_queue_func_t func) | 94 | init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) |
94 | { | 95 | { |
95 | q->flags = 0; | 96 | q->flags = 0; |
96 | q->private = NULL; | 97 | q->private = NULL; |
97 | q->func = func; | 98 | q->func = func; |
98 | } | 99 | } |
99 | 100 | ||
100 | static inline int waitqueue_active(wait_queue_head_t *q) | 101 | static inline int waitqueue_active(wait_queue_head_t *q) |
@@ -114,8 +115,8 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) | |||
114 | /* | 115 | /* |
115 | * Used for wake-one threads: | 116 | * Used for wake-one threads: |
116 | */ | 117 | */ |
117 | static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, | 118 | static inline void |
118 | wait_queue_t *wait) | 119 | __add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) |
119 | { | 120 | { |
120 | wait->flags |= WQ_FLAG_EXCLUSIVE; | 121 | wait->flags |= WQ_FLAG_EXCLUSIVE; |
121 | __add_wait_queue(q, wait); | 122 | __add_wait_queue(q, wait); |
@@ -127,23 +128,22 @@ static inline void __add_wait_queue_tail(wait_queue_head_t *head, | |||
127 | list_add_tail(&new->task_list, &head->task_list); | 128 | list_add_tail(&new->task_list, &head->task_list); |
128 | } | 129 | } |
129 | 130 | ||
130 | static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q, | 131 | static inline void |
131 | wait_queue_t *wait) | 132 | __add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait) |
132 | { | 133 | { |
133 | wait->flags |= WQ_FLAG_EXCLUSIVE; | 134 | wait->flags |= WQ_FLAG_EXCLUSIVE; |
134 | __add_wait_queue_tail(q, wait); | 135 | __add_wait_queue_tail(q, wait); |
135 | } | 136 | } |
136 | 137 | ||
137 | static inline void __remove_wait_queue(wait_queue_head_t *head, | 138 | static inline void |
138 | wait_queue_t *old) | 139 | __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) |
139 | { | 140 | { |
140 | list_del(&old->task_list); | 141 | list_del(&old->task_list); |
141 | } | 142 | } |
142 | 143 | ||
143 | void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); | 144 | void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); |
144 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); | 145 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); |
145 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, | 146 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); |
146 | void *key); | ||
147 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); | 147 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); |
148 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); | 148 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); |
149 | void __wake_up_bit(wait_queue_head_t *, void *, int); | 149 | void __wake_up_bit(wait_queue_head_t *, void *, int); |
@@ -170,27 +170,61 @@ wait_queue_head_t *bit_waitqueue(void *, int); | |||
170 | /* | 170 | /* |
171 | * Wakeup macros to be used to report events to the targets. | 171 | * Wakeup macros to be used to report events to the targets. |
172 | */ | 172 | */ |
173 | #define wake_up_poll(x, m) \ | 173 | #define wake_up_poll(x, m) \ |
174 | __wake_up(x, TASK_NORMAL, 1, (void *) (m)) | 174 | __wake_up(x, TASK_NORMAL, 1, (void *) (m)) |
175 | #define wake_up_locked_poll(x, m) \ | 175 | #define wake_up_locked_poll(x, m) \ |
176 | __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) | 176 | __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) |
177 | #define wake_up_interruptible_poll(x, m) \ | 177 | #define wake_up_interruptible_poll(x, m) \ |
178 | __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) | 178 | __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) |
179 | #define wake_up_interruptible_sync_poll(x, m) \ | 179 | #define wake_up_interruptible_sync_poll(x, m) \ |
180 | __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) | 180 | __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) |
181 | 181 | ||
182 | #define __wait_event(wq, condition) \ | 182 | #define ___wait_cond_timeout(condition) \ |
183 | do { \ | 183 | ({ \ |
184 | bool __cond = (condition); \ | ||
185 | if (__cond && !__ret) \ | ||
186 | __ret = 1; \ | ||
187 | __cond || !__ret; \ | ||
188 | }) | ||
189 | |||
190 | #define ___wait_signal_pending(state) \ | ||
191 | ((state == TASK_INTERRUPTIBLE && signal_pending(current)) || \ | ||
192 | (state == TASK_KILLABLE && fatal_signal_pending(current))) | ||
193 | |||
194 | #define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ | ||
195 | ({ \ | ||
196 | __label__ __out; \ | ||
184 | DEFINE_WAIT(__wait); \ | 197 | DEFINE_WAIT(__wait); \ |
198 | long __ret = ret; \ | ||
185 | \ | 199 | \ |
186 | for (;;) { \ | 200 | for (;;) { \ |
187 | prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ | 201 | if (exclusive) \ |
202 | prepare_to_wait_exclusive(&wq, &__wait, state); \ | ||
203 | else \ | ||
204 | prepare_to_wait(&wq, &__wait, state); \ | ||
205 | \ | ||
188 | if (condition) \ | 206 | if (condition) \ |
189 | break; \ | 207 | break; \ |
190 | schedule(); \ | 208 | \ |
209 | if (___wait_signal_pending(state)) { \ | ||
210 | __ret = -ERESTARTSYS; \ | ||
211 | if (exclusive) { \ | ||
212 | abort_exclusive_wait(&wq, &__wait, \ | ||
213 | state, NULL); \ | ||
214 | goto __out; \ | ||
215 | } \ | ||
216 | break; \ | ||
217 | } \ | ||
218 | \ | ||
219 | cmd; \ | ||
191 | } \ | 220 | } \ |
192 | finish_wait(&wq, &__wait); \ | 221 | finish_wait(&wq, &__wait); \ |
193 | } while (0) | 222 | __out: __ret; \ |
223 | }) | ||
224 | |||
225 | #define __wait_event(wq, condition) \ | ||
226 | (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ | ||
227 | schedule()) | ||
194 | 228 | ||
195 | /** | 229 | /** |
196 | * wait_event - sleep until a condition gets true | 230 | * wait_event - sleep until a condition gets true |
@@ -204,29 +238,17 @@ do { \ | |||
204 | * wake_up() has to be called after changing any variable that could | 238 | * wake_up() has to be called after changing any variable that could |
205 | * change the result of the wait condition. | 239 | * change the result of the wait condition. |
206 | */ | 240 | */ |
207 | #define wait_event(wq, condition) \ | 241 | #define wait_event(wq, condition) \ |
208 | do { \ | 242 | do { \ |
209 | if (condition) \ | 243 | if (condition) \ |
210 | break; \ | 244 | break; \ |
211 | __wait_event(wq, condition); \ | 245 | __wait_event(wq, condition); \ |
212 | } while (0) | 246 | } while (0) |
213 | 247 | ||
214 | #define __wait_event_timeout(wq, condition, ret) \ | 248 | #define __wait_event_timeout(wq, condition, timeout) \ |
215 | do { \ | 249 | ___wait_event(wq, ___wait_cond_timeout(condition), \ |
216 | DEFINE_WAIT(__wait); \ | 250 | TASK_UNINTERRUPTIBLE, 0, timeout, \ |
217 | \ | 251 | __ret = schedule_timeout(__ret)) |
218 | for (;;) { \ | ||
219 | prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ | ||
220 | if (condition) \ | ||
221 | break; \ | ||
222 | ret = schedule_timeout(ret); \ | ||
223 | if (!ret) \ | ||
224 | break; \ | ||
225 | } \ | ||
226 | if (!ret && (condition)) \ | ||
227 | ret = 1; \ | ||
228 | finish_wait(&wq, &__wait); \ | ||
229 | } while (0) | ||
230 | 252 | ||
231 | /** | 253 | /** |
232 | * wait_event_timeout - sleep until a condition gets true or a timeout elapses | 254 | * wait_event_timeout - sleep until a condition gets true or a timeout elapses |
@@ -248,28 +270,14 @@ do { \ | |||
248 | #define wait_event_timeout(wq, condition, timeout) \ | 270 | #define wait_event_timeout(wq, condition, timeout) \ |
249 | ({ \ | 271 | ({ \ |
250 | long __ret = timeout; \ | 272 | long __ret = timeout; \ |
251 | if (!(condition)) \ | 273 | if (!(condition)) \ |
252 | __wait_event_timeout(wq, condition, __ret); \ | 274 | __ret = __wait_event_timeout(wq, condition, timeout); \ |
253 | __ret; \ | 275 | __ret; \ |
254 | }) | 276 | }) |
255 | 277 | ||
256 | #define __wait_event_interruptible(wq, condition, ret) \ | 278 | #define __wait_event_interruptible(wq, condition) \ |
257 | do { \ | 279 | ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ |
258 | DEFINE_WAIT(__wait); \ | 280 | schedule()) |
259 | \ | ||
260 | for (;;) { \ | ||
261 | prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ | ||
262 | if (condition) \ | ||
263 | break; \ | ||
264 | if (!signal_pending(current)) { \ | ||
265 | schedule(); \ | ||
266 | continue; \ | ||
267 | } \ | ||
268 | ret = -ERESTARTSYS; \ | ||
269 | break; \ | ||
270 | } \ | ||
271 | finish_wait(&wq, &__wait); \ | ||
272 | } while (0) | ||
273 | 281 | ||
274 | /** | 282 | /** |
275 | * wait_event_interruptible - sleep until a condition gets true | 283 | * wait_event_interruptible - sleep until a condition gets true |
@@ -290,31 +298,14 @@ do { \ | |||
290 | ({ \ | 298 | ({ \ |
291 | int __ret = 0; \ | 299 | int __ret = 0; \ |
292 | if (!(condition)) \ | 300 | if (!(condition)) \ |
293 | __wait_event_interruptible(wq, condition, __ret); \ | 301 | __ret = __wait_event_interruptible(wq, condition); \ |
294 | __ret; \ | 302 | __ret; \ |
295 | }) | 303 | }) |
296 | 304 | ||
297 | #define __wait_event_interruptible_timeout(wq, condition, ret) \ | 305 | #define __wait_event_interruptible_timeout(wq, condition, timeout) \ |
298 | do { \ | 306 | ___wait_event(wq, ___wait_cond_timeout(condition), \ |
299 | DEFINE_WAIT(__wait); \ | 307 | TASK_INTERRUPTIBLE, 0, timeout, \ |
300 | \ | 308 | __ret = schedule_timeout(__ret)) |
301 | for (;;) { \ | ||
302 | prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ | ||
303 | if (condition) \ | ||
304 | break; \ | ||
305 | if (!signal_pending(current)) { \ | ||
306 | ret = schedule_timeout(ret); \ | ||
307 | if (!ret) \ | ||
308 | break; \ | ||
309 | continue; \ | ||
310 | } \ | ||
311 | ret = -ERESTARTSYS; \ | ||
312 | break; \ | ||
313 | } \ | ||
314 | if (!ret && (condition)) \ | ||
315 | ret = 1; \ | ||
316 | finish_wait(&wq, &__wait); \ | ||
317 | } while (0) | ||
318 | 309 | ||
319 | /** | 310 | /** |
320 | * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses | 311 | * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses |
@@ -338,14 +329,14 @@ do { \ | |||
338 | ({ \ | 329 | ({ \ |
339 | long __ret = timeout; \ | 330 | long __ret = timeout; \ |
340 | if (!(condition)) \ | 331 | if (!(condition)) \ |
341 | __wait_event_interruptible_timeout(wq, condition, __ret); \ | 332 | __ret = __wait_event_interruptible_timeout(wq, \ |
333 | condition, timeout); \ | ||
342 | __ret; \ | 334 | __ret; \ |
343 | }) | 335 | }) |
344 | 336 | ||
345 | #define __wait_event_hrtimeout(wq, condition, timeout, state) \ | 337 | #define __wait_event_hrtimeout(wq, condition, timeout, state) \ |
346 | ({ \ | 338 | ({ \ |
347 | int __ret = 0; \ | 339 | int __ret = 0; \ |
348 | DEFINE_WAIT(__wait); \ | ||
349 | struct hrtimer_sleeper __t; \ | 340 | struct hrtimer_sleeper __t; \ |
350 | \ | 341 | \ |
351 | hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ | 342 | hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ |
@@ -356,25 +347,15 @@ do { \ | |||
356 | current->timer_slack_ns, \ | 347 | current->timer_slack_ns, \ |
357 | HRTIMER_MODE_REL); \ | 348 | HRTIMER_MODE_REL); \ |
358 | \ | 349 | \ |
359 | for (;;) { \ | 350 | __ret = ___wait_event(wq, condition, state, 0, 0, \ |
360 | prepare_to_wait(&wq, &__wait, state); \ | ||
361 | if (condition) \ | ||
362 | break; \ | ||
363 | if (state == TASK_INTERRUPTIBLE && \ | ||
364 | signal_pending(current)) { \ | ||
365 | __ret = -ERESTARTSYS; \ | ||
366 | break; \ | ||
367 | } \ | ||
368 | if (!__t.task) { \ | 351 | if (!__t.task) { \ |
369 | __ret = -ETIME; \ | 352 | __ret = -ETIME; \ |
370 | break; \ | 353 | break; \ |
371 | } \ | 354 | } \ |
372 | schedule(); \ | 355 | schedule()); \ |
373 | } \ | ||
374 | \ | 356 | \ |
375 | hrtimer_cancel(&__t.timer); \ | 357 | hrtimer_cancel(&__t.timer); \ |
376 | destroy_hrtimer_on_stack(&__t.timer); \ | 358 | destroy_hrtimer_on_stack(&__t.timer); \ |
377 | finish_wait(&wq, &__wait); \ | ||
378 | __ret; \ | 359 | __ret; \ |
379 | }) | 360 | }) |
380 | 361 | ||
@@ -428,33 +409,15 @@ do { \ | |||
428 | __ret; \ | 409 | __ret; \ |
429 | }) | 410 | }) |
430 | 411 | ||
431 | #define __wait_event_interruptible_exclusive(wq, condition, ret) \ | 412 | #define __wait_event_interruptible_exclusive(wq, condition) \ |
432 | do { \ | 413 | ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ |
433 | DEFINE_WAIT(__wait); \ | 414 | schedule()) |
434 | \ | ||
435 | for (;;) { \ | ||
436 | prepare_to_wait_exclusive(&wq, &__wait, \ | ||
437 | TASK_INTERRUPTIBLE); \ | ||
438 | if (condition) { \ | ||
439 | finish_wait(&wq, &__wait); \ | ||
440 | break; \ | ||
441 | } \ | ||
442 | if (!signal_pending(current)) { \ | ||
443 | schedule(); \ | ||
444 | continue; \ | ||
445 | } \ | ||
446 | ret = -ERESTARTSYS; \ | ||
447 | abort_exclusive_wait(&wq, &__wait, \ | ||
448 | TASK_INTERRUPTIBLE, NULL); \ | ||
449 | break; \ | ||
450 | } \ | ||
451 | } while (0) | ||
452 | 415 | ||
453 | #define wait_event_interruptible_exclusive(wq, condition) \ | 416 | #define wait_event_interruptible_exclusive(wq, condition) \ |
454 | ({ \ | 417 | ({ \ |
455 | int __ret = 0; \ | 418 | int __ret = 0; \ |
456 | if (!(condition)) \ | 419 | if (!(condition)) \ |
457 | __wait_event_interruptible_exclusive(wq, condition, __ret);\ | 420 | __ret = __wait_event_interruptible_exclusive(wq, condition);\ |
458 | __ret; \ | 421 | __ret; \ |
459 | }) | 422 | }) |
460 | 423 | ||
@@ -606,24 +569,8 @@ do { \ | |||
606 | ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1)) | 569 | ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1)) |
607 | 570 | ||
608 | 571 | ||
609 | 572 | #define __wait_event_killable(wq, condition) \ | |
610 | #define __wait_event_killable(wq, condition, ret) \ | 573 | ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) |
611 | do { \ | ||
612 | DEFINE_WAIT(__wait); \ | ||
613 | \ | ||
614 | for (;;) { \ | ||
615 | prepare_to_wait(&wq, &__wait, TASK_KILLABLE); \ | ||
616 | if (condition) \ | ||
617 | break; \ | ||
618 | if (!fatal_signal_pending(current)) { \ | ||
619 | schedule(); \ | ||
620 | continue; \ | ||
621 | } \ | ||
622 | ret = -ERESTARTSYS; \ | ||
623 | break; \ | ||
624 | } \ | ||
625 | finish_wait(&wq, &__wait); \ | ||
626 | } while (0) | ||
627 | 574 | ||
628 | /** | 575 | /** |
629 | * wait_event_killable - sleep until a condition gets true | 576 | * wait_event_killable - sleep until a condition gets true |
@@ -644,26 +591,17 @@ do { \ | |||
644 | ({ \ | 591 | ({ \ |
645 | int __ret = 0; \ | 592 | int __ret = 0; \ |
646 | if (!(condition)) \ | 593 | if (!(condition)) \ |
647 | __wait_event_killable(wq, condition, __ret); \ | 594 | __ret = __wait_event_killable(wq, condition); \ |
648 | __ret; \ | 595 | __ret; \ |
649 | }) | 596 | }) |
650 | 597 | ||
651 | 598 | ||
652 | #define __wait_event_lock_irq(wq, condition, lock, cmd) \ | 599 | #define __wait_event_lock_irq(wq, condition, lock, cmd) \ |
653 | do { \ | 600 | (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ |
654 | DEFINE_WAIT(__wait); \ | 601 | spin_unlock_irq(&lock); \ |
655 | \ | 602 | cmd; \ |
656 | for (;;) { \ | 603 | schedule(); \ |
657 | prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ | 604 | spin_lock_irq(&lock)) |
658 | if (condition) \ | ||
659 | break; \ | ||
660 | spin_unlock_irq(&lock); \ | ||
661 | cmd; \ | ||
662 | schedule(); \ | ||
663 | spin_lock_irq(&lock); \ | ||
664 | } \ | ||
665 | finish_wait(&wq, &__wait); \ | ||
666 | } while (0) | ||
667 | 605 | ||
668 | /** | 606 | /** |
669 | * wait_event_lock_irq_cmd - sleep until a condition gets true. The | 607 | * wait_event_lock_irq_cmd - sleep until a condition gets true. The |
@@ -723,26 +661,12 @@ do { \ | |||
723 | } while (0) | 661 | } while (0) |
724 | 662 | ||
725 | 663 | ||
726 | #define __wait_event_interruptible_lock_irq(wq, condition, \ | 664 | #define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd) \ |
727 | lock, ret, cmd) \ | 665 | ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ |
728 | do { \ | 666 | spin_unlock_irq(&lock); \ |
729 | DEFINE_WAIT(__wait); \ | 667 | cmd; \ |
730 | \ | 668 | schedule(); \ |
731 | for (;;) { \ | 669 | spin_lock_irq(&lock)) |
732 | prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ | ||
733 | if (condition) \ | ||
734 | break; \ | ||
735 | if (signal_pending(current)) { \ | ||
736 | ret = -ERESTARTSYS; \ | ||
737 | break; \ | ||
738 | } \ | ||
739 | spin_unlock_irq(&lock); \ | ||
740 | cmd; \ | ||
741 | schedule(); \ | ||
742 | spin_lock_irq(&lock); \ | ||
743 | } \ | ||
744 | finish_wait(&wq, &__wait); \ | ||
745 | } while (0) | ||
746 | 670 | ||
747 | /** | 671 | /** |
748 | * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. | 672 | * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. |
@@ -772,10 +696,9 @@ do { \ | |||
772 | #define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ | 696 | #define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ |
773 | ({ \ | 697 | ({ \ |
774 | int __ret = 0; \ | 698 | int __ret = 0; \ |
775 | \ | ||
776 | if (!(condition)) \ | 699 | if (!(condition)) \ |
777 | __wait_event_interruptible_lock_irq(wq, condition, \ | 700 | __ret = __wait_event_interruptible_lock_irq(wq, \ |
778 | lock, __ret, cmd); \ | 701 | condition, lock, cmd); \ |
779 | __ret; \ | 702 | __ret; \ |
780 | }) | 703 | }) |
781 | 704 | ||
@@ -804,39 +727,24 @@ do { \ | |||
804 | #define wait_event_interruptible_lock_irq(wq, condition, lock) \ | 727 | #define wait_event_interruptible_lock_irq(wq, condition, lock) \ |
805 | ({ \ | 728 | ({ \ |
806 | int __ret = 0; \ | 729 | int __ret = 0; \ |
807 | \ | ||
808 | if (!(condition)) \ | 730 | if (!(condition)) \ |
809 | __wait_event_interruptible_lock_irq(wq, condition, \ | 731 | __ret = __wait_event_interruptible_lock_irq(wq, \ |
810 | lock, __ret, ); \ | 732 | condition, lock,) \ |
811 | __ret; \ | 733 | __ret; \ |
812 | }) | 734 | }) |
813 | 735 | ||
814 | #define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ | 736 | #define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ |
815 | lock, ret) \ | 737 | lock, timeout) \ |
816 | do { \ | 738 | ___wait_event(wq, ___wait_cond_timeout(condition), \ |
817 | DEFINE_WAIT(__wait); \ | 739 | TASK_INTERRUPTIBLE, 0, ret, \ |
818 | \ | 740 | spin_unlock_irq(&lock); \ |
819 | for (;;) { \ | 741 | __ret = schedule_timeout(__ret); \ |
820 | prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ | 742 | spin_lock_irq(&lock)); |
821 | if (condition) \ | ||
822 | break; \ | ||
823 | if (signal_pending(current)) { \ | ||
824 | ret = -ERESTARTSYS; \ | ||
825 | break; \ | ||
826 | } \ | ||
827 | spin_unlock_irq(&lock); \ | ||
828 | ret = schedule_timeout(ret); \ | ||
829 | spin_lock_irq(&lock); \ | ||
830 | if (!ret) \ | ||
831 | break; \ | ||
832 | } \ | ||
833 | finish_wait(&wq, &__wait); \ | ||
834 | } while (0) | ||
835 | 743 | ||
836 | /** | 744 | /** |
837 | * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets true or a timeout elapses. | 745 | * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets |
838 | * The condition is checked under the lock. This is expected | 746 | * true or a timeout elapses. The condition is checked under |
839 | * to be called with the lock taken. | 747 | * the lock. This is expected to be called with the lock taken. |
840 | * @wq: the waitqueue to wait on | 748 | * @wq: the waitqueue to wait on |
841 | * @condition: a C expression for the event to wait for | 749 | * @condition: a C expression for the event to wait for |
842 | * @lock: a locked spinlock_t, which will be released before schedule() | 750 | * @lock: a locked spinlock_t, which will be released before schedule() |
@@ -860,11 +768,10 @@ do { \ | |||
860 | #define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \ | 768 | #define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \ |
861 | timeout) \ | 769 | timeout) \ |
862 | ({ \ | 770 | ({ \ |
863 | int __ret = timeout; \ | 771 | long __ret = timeout; \ |
864 | \ | ||
865 | if (!(condition)) \ | 772 | if (!(condition)) \ |
866 | __wait_event_interruptible_lock_irq_timeout( \ | 773 | __ret = __wait_event_interruptible_lock_irq_timeout( \ |
867 | wq, condition, lock, __ret); \ | 774 | wq, condition, lock, timeout); \ |
868 | __ret; \ | 775 | __ret; \ |
869 | }) | 776 | }) |
870 | 777 | ||
@@ -875,11 +782,9 @@ do { \ | |||
875 | * We plan to remove these interfaces. | 782 | * We plan to remove these interfaces. |
876 | */ | 783 | */ |
877 | extern void sleep_on(wait_queue_head_t *q); | 784 | extern void sleep_on(wait_queue_head_t *q); |
878 | extern long sleep_on_timeout(wait_queue_head_t *q, | 785 | extern long sleep_on_timeout(wait_queue_head_t *q, signed long timeout); |
879 | signed long timeout); | ||
880 | extern void interruptible_sleep_on(wait_queue_head_t *q); | 786 | extern void interruptible_sleep_on(wait_queue_head_t *q); |
881 | extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, | 787 | extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout); |
882 | signed long timeout); | ||
883 | 788 | ||
884 | /* | 789 | /* |
885 | * Waitqueues which are removed from the waitqueue_head at wakeup time | 790 | * Waitqueues which are removed from the waitqueue_head at wakeup time |
@@ -887,8 +792,7 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, | |||
887 | void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); | 792 | void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); |
888 | void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); | 793 | void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); |
889 | void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); | 794 | void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); |
890 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | 795 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); |
891 | unsigned int mode, void *key); | ||
892 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); | 796 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); |
893 | int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); | 797 | int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); |
894 | 798 | ||
@@ -934,8 +838,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); | |||
934 | * One uses wait_on_bit() where one is waiting for the bit to clear, | 838 | * One uses wait_on_bit() where one is waiting for the bit to clear, |
935 | * but has no intention of setting it. | 839 | * but has no intention of setting it. |
936 | */ | 840 | */ |
937 | static inline int wait_on_bit(void *word, int bit, | 841 | static inline int |
938 | int (*action)(void *), unsigned mode) | 842 | wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode) |
939 | { | 843 | { |
940 | if (!test_bit(bit, word)) | 844 | if (!test_bit(bit, word)) |
941 | return 0; | 845 | return 0; |
@@ -958,8 +862,8 @@ static inline int wait_on_bit(void *word, int bit, | |||
958 | * One uses wait_on_bit_lock() where one is waiting for the bit to | 862 | * One uses wait_on_bit_lock() where one is waiting for the bit to |
959 | * clear with the intention of setting it, and when done, clearing it. | 863 | * clear with the intention of setting it, and when done, clearing it. |
960 | */ | 864 | */ |
961 | static inline int wait_on_bit_lock(void *word, int bit, | 865 | static inline int |
962 | int (*action)(void *), unsigned mode) | 866 | wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode) |
963 | { | 867 | { |
964 | if (!test_and_set_bit(bit, word)) | 868 | if (!test_and_set_bit(bit, word)) |
965 | return 0; | 869 | return 0; |
@@ -983,5 +887,5 @@ int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) | |||
983 | return 0; | 887 | return 0; |
984 | return out_of_line_wait_on_atomic_t(val, action, mode); | 888 | return out_of_line_wait_on_atomic_t(val, action, mode); |
985 | } | 889 | } |
986 | 890 | ||
987 | #endif | 891 | #endif /* _LINUX_WAIT_H */ |
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 2e7d9947a10d..613381bcde40 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h | |||
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p) | |||
100 | /* | 100 | /* |
101 | * For all intents and purposes a preempted task is a running task. | 101 | * For all intents and purposes a preempted task is a running task. |
102 | */ | 102 | */ |
103 | if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) | 103 | if (task_preempt_count(p) & PREEMPT_ACTIVE) |
104 | state = TASK_RUNNING | TASK_STATE_MAX; | 104 | state = TASK_RUNNING | TASK_STATE_MAX; |
105 | #endif | 105 | #endif |
106 | 106 | ||
diff --git a/init/main.c b/init/main.c index 63d3e8f2970c..379090fadac9 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -693,7 +693,7 @@ int __init_or_module do_one_initcall(initcall_t fn) | |||
693 | 693 | ||
694 | if (preempt_count() != count) { | 694 | if (preempt_count() != count) { |
695 | sprintf(msgbuf, "preemption imbalance "); | 695 | sprintf(msgbuf, "preemption imbalance "); |
696 | preempt_count() = count; | 696 | preempt_count_set(count); |
697 | } | 697 | } |
698 | if (irqs_disabled()) { | 698 | if (irqs_disabled()) { |
699 | strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); | 699 | strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); |
diff --git a/kernel/bounds.c b/kernel/bounds.c index 0c9b862292b2..e8ca97b5c386 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/mmzone.h> | 10 | #include <linux/mmzone.h> |
11 | #include <linux/kbuild.h> | 11 | #include <linux/kbuild.h> |
12 | #include <linux/page_cgroup.h> | 12 | #include <linux/page_cgroup.h> |
13 | #include <linux/log2.h> | ||
13 | 14 | ||
14 | void foo(void) | 15 | void foo(void) |
15 | { | 16 | { |
@@ -17,5 +18,8 @@ void foo(void) | |||
17 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); | 18 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); |
18 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); | 19 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); |
19 | DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); | 20 | DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); |
21 | #ifdef CONFIG_SMP | ||
22 | DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); | ||
23 | #endif | ||
20 | /* End of constants */ | 24 | /* End of constants */ |
21 | } | 25 | } |
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 859c8dfd78a1..e5f3917aa05b 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void) | |||
120 | * instead of preempt_schedule() to exit user context if needed before | 120 | * instead of preempt_schedule() to exit user context if needed before |
121 | * calling the scheduler. | 121 | * calling the scheduler. |
122 | */ | 122 | */ |
123 | void __sched notrace preempt_schedule_context(void) | 123 | asmlinkage void __sched notrace preempt_schedule_context(void) |
124 | { | 124 | { |
125 | enum ctx_state prev_ctx; | 125 | enum ctx_state prev_ctx; |
126 | 126 | ||
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index e695c0a0bcb5..988573a9a387 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c | |||
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void) | |||
44 | rcu_idle_enter(); | 44 | rcu_idle_enter(); |
45 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 45 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
46 | local_irq_enable(); | 46 | local_irq_enable(); |
47 | while (!need_resched()) | 47 | while (!tif_need_resched()) |
48 | cpu_relax(); | 48 | cpu_relax(); |
49 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 49 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
50 | rcu_idle_exit(); | 50 | rcu_idle_exit(); |
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void) | |||
92 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) { | 92 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) { |
93 | cpu_idle_poll(); | 93 | cpu_idle_poll(); |
94 | } else { | 94 | } else { |
95 | current_clr_polling(); | 95 | if (!current_clr_polling_and_test()) { |
96 | if (!need_resched()) { | ||
97 | stop_critical_timings(); | 96 | stop_critical_timings(); |
98 | rcu_idle_enter(); | 97 | rcu_idle_enter(); |
99 | arch_cpu_idle(); | 98 | arch_cpu_idle(); |
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void) | |||
103 | } else { | 102 | } else { |
104 | local_irq_enable(); | 103 | local_irq_enable(); |
105 | } | 104 | } |
106 | current_set_polling(); | 105 | __current_set_polling(); |
107 | } | 106 | } |
108 | arch_cpu_idle_exit(); | 107 | arch_cpu_idle_exit(); |
108 | /* | ||
109 | * We need to test and propagate the TIF_NEED_RESCHED | ||
110 | * bit here because we might not have send the | ||
111 | * reschedule IPI to idle tasks. | ||
112 | */ | ||
113 | if (tif_need_resched()) | ||
114 | set_preempt_need_resched(); | ||
109 | } | 115 | } |
110 | tick_nohz_idle_exit(); | 116 | tick_nohz_idle_exit(); |
111 | schedule_preempt_disabled(); | 117 | schedule_preempt_disabled(); |
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
129 | */ | 135 | */ |
130 | boot_init_stack_canary(); | 136 | boot_init_stack_canary(); |
131 | #endif | 137 | #endif |
132 | current_set_polling(); | 138 | __current_set_polling(); |
133 | arch_cpu_idle_prepare(); | 139 | arch_cpu_idle_prepare(); |
134 | cpu_idle_loop(); | 140 | cpu_idle_loop(); |
135 | } | 141 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index 086fe73ad6bd..c93be06dee87 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -817,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
817 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 817 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
818 | mm->pmd_huge_pte = NULL; | 818 | mm->pmd_huge_pte = NULL; |
819 | #endif | 819 | #endif |
820 | #ifdef CONFIG_NUMA_BALANCING | ||
821 | mm->first_nid = NUMA_PTE_SCAN_INIT; | ||
822 | #endif | ||
823 | if (!mm_init(mm, tsk)) | 820 | if (!mm_init(mm, tsk)) |
824 | goto fail_nomem; | 821 | goto fail_nomem; |
825 | 822 | ||
@@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1313 | #endif | 1310 | #endif |
1314 | 1311 | ||
1315 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1312 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1316 | sched_fork(p); | 1313 | sched_fork(clone_flags, p); |
1317 | 1314 | ||
1318 | retval = perf_event_init_task(p); | 1315 | retval = perf_event_init_task(p); |
1319 | if (retval) | 1316 | if (retval) |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 32618b3fe4e6..1dc9f3604ad8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -898,6 +898,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
898 | force_quiescent_state(rsp); /* Kick them all. */ | 898 | force_quiescent_state(rsp); /* Kick them all. */ |
899 | } | 899 | } |
900 | 900 | ||
901 | /* | ||
902 | * This function really isn't for public consumption, but RCU is special in | ||
903 | * that context switches can allow the state machine to make progress. | ||
904 | */ | ||
905 | extern void resched_cpu(int cpu); | ||
906 | |||
901 | static void print_cpu_stall(struct rcu_state *rsp) | 907 | static void print_cpu_stall(struct rcu_state *rsp) |
902 | { | 908 | { |
903 | int cpu; | 909 | int cpu; |
@@ -927,7 +933,14 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
927 | 3 * rcu_jiffies_till_stall_check() + 3; | 933 | 3 * rcu_jiffies_till_stall_check() + 3; |
928 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 934 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
929 | 935 | ||
930 | set_need_resched(); /* kick ourselves to get things going. */ | 936 | /* |
937 | * Attempt to revive the RCU machinery by forcing a context switch. | ||
938 | * | ||
939 | * A context switch would normally allow the RCU state machine to make | ||
940 | * progress and it could be we're stuck in kernel space without context | ||
941 | * switches for an entirely unreasonable amount of time. | ||
942 | */ | ||
943 | resched_cpu(smp_processor_id()); | ||
931 | } | 944 | } |
932 | 945 | ||
933 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | 946 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995a..0c3feebcf112 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -513,12 +513,11 @@ static inline void init_hrtick(void) | |||
513 | * might also involve a cross-CPU call to trigger the scheduler on | 513 | * might also involve a cross-CPU call to trigger the scheduler on |
514 | * the target CPU. | 514 | * the target CPU. |
515 | */ | 515 | */ |
516 | #ifdef CONFIG_SMP | ||
517 | void resched_task(struct task_struct *p) | 516 | void resched_task(struct task_struct *p) |
518 | { | 517 | { |
519 | int cpu; | 518 | int cpu; |
520 | 519 | ||
521 | assert_raw_spin_locked(&task_rq(p)->lock); | 520 | lockdep_assert_held(&task_rq(p)->lock); |
522 | 521 | ||
523 | if (test_tsk_need_resched(p)) | 522 | if (test_tsk_need_resched(p)) |
524 | return; | 523 | return; |
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p) | |||
526 | set_tsk_need_resched(p); | 525 | set_tsk_need_resched(p); |
527 | 526 | ||
528 | cpu = task_cpu(p); | 527 | cpu = task_cpu(p); |
529 | if (cpu == smp_processor_id()) | 528 | if (cpu == smp_processor_id()) { |
529 | set_preempt_need_resched(); | ||
530 | return; | 530 | return; |
531 | } | ||
531 | 532 | ||
532 | /* NEED_RESCHED must be visible before we test polling */ | 533 | /* NEED_RESCHED must be visible before we test polling */ |
533 | smp_mb(); | 534 | smp_mb(); |
@@ -546,6 +547,7 @@ void resched_cpu(int cpu) | |||
546 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 547 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
547 | } | 548 | } |
548 | 549 | ||
550 | #ifdef CONFIG_SMP | ||
549 | #ifdef CONFIG_NO_HZ_COMMON | 551 | #ifdef CONFIG_NO_HZ_COMMON |
550 | /* | 552 | /* |
551 | * In the semi idle case, use the nearest busy cpu for migrating timers | 553 | * In the semi idle case, use the nearest busy cpu for migrating timers |
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq) | |||
693 | } | 695 | } |
694 | } | 696 | } |
695 | 697 | ||
696 | #else /* !CONFIG_SMP */ | ||
697 | void resched_task(struct task_struct *p) | ||
698 | { | ||
699 | assert_raw_spin_locked(&task_rq(p)->lock); | ||
700 | set_tsk_need_resched(p); | ||
701 | } | ||
702 | #endif /* CONFIG_SMP */ | 698 | #endif /* CONFIG_SMP */ |
703 | 699 | ||
704 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ | 700 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p) | |||
767 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 763 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
768 | { | 764 | { |
769 | update_rq_clock(rq); | 765 | update_rq_clock(rq); |
770 | sched_info_queued(p); | 766 | sched_info_queued(rq, p); |
771 | p->sched_class->enqueue_task(rq, p, flags); | 767 | p->sched_class->enqueue_task(rq, p, flags); |
772 | } | 768 | } |
773 | 769 | ||
774 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 770 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
775 | { | 771 | { |
776 | update_rq_clock(rq); | 772 | update_rq_clock(rq); |
777 | sched_info_dequeued(p); | 773 | sched_info_dequeued(rq, p); |
778 | p->sched_class->dequeue_task(rq, p, flags); | 774 | p->sched_class->dequeue_task(rq, p, flags); |
779 | } | 775 | } |
780 | 776 | ||
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
987 | * ttwu() will sort out the placement. | 983 | * ttwu() will sort out the placement. |
988 | */ | 984 | */ |
989 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 985 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
990 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 986 | !(task_preempt_count(p) & PREEMPT_ACTIVE)); |
991 | 987 | ||
992 | #ifdef CONFIG_LOCKDEP | 988 | #ifdef CONFIG_LOCKDEP |
993 | /* | 989 | /* |
@@ -1017,6 +1013,102 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1017 | __set_task_cpu(p, new_cpu); | 1013 | __set_task_cpu(p, new_cpu); |
1018 | } | 1014 | } |
1019 | 1015 | ||
1016 | static void __migrate_swap_task(struct task_struct *p, int cpu) | ||
1017 | { | ||
1018 | if (p->on_rq) { | ||
1019 | struct rq *src_rq, *dst_rq; | ||
1020 | |||
1021 | src_rq = task_rq(p); | ||
1022 | dst_rq = cpu_rq(cpu); | ||
1023 | |||
1024 | deactivate_task(src_rq, p, 0); | ||
1025 | set_task_cpu(p, cpu); | ||
1026 | activate_task(dst_rq, p, 0); | ||
1027 | check_preempt_curr(dst_rq, p, 0); | ||
1028 | } else { | ||
1029 | /* | ||
1030 | * Task isn't running anymore; make it appear like we migrated | ||
1031 | * it before it went to sleep. This means on wakeup we make the | ||
1032 | * previous cpu our targer instead of where it really is. | ||
1033 | */ | ||
1034 | p->wake_cpu = cpu; | ||
1035 | } | ||
1036 | } | ||
1037 | |||
1038 | struct migration_swap_arg { | ||
1039 | struct task_struct *src_task, *dst_task; | ||
1040 | int src_cpu, dst_cpu; | ||
1041 | }; | ||
1042 | |||
1043 | static int migrate_swap_stop(void *data) | ||
1044 | { | ||
1045 | struct migration_swap_arg *arg = data; | ||
1046 | struct rq *src_rq, *dst_rq; | ||
1047 | int ret = -EAGAIN; | ||
1048 | |||
1049 | src_rq = cpu_rq(arg->src_cpu); | ||
1050 | dst_rq = cpu_rq(arg->dst_cpu); | ||
1051 | |||
1052 | double_rq_lock(src_rq, dst_rq); | ||
1053 | if (task_cpu(arg->dst_task) != arg->dst_cpu) | ||
1054 | goto unlock; | ||
1055 | |||
1056 | if (task_cpu(arg->src_task) != arg->src_cpu) | ||
1057 | goto unlock; | ||
1058 | |||
1059 | if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) | ||
1060 | goto unlock; | ||
1061 | |||
1062 | if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) | ||
1063 | goto unlock; | ||
1064 | |||
1065 | __migrate_swap_task(arg->src_task, arg->dst_cpu); | ||
1066 | __migrate_swap_task(arg->dst_task, arg->src_cpu); | ||
1067 | |||
1068 | ret = 0; | ||
1069 | |||
1070 | unlock: | ||
1071 | double_rq_unlock(src_rq, dst_rq); | ||
1072 | |||
1073 | return ret; | ||
1074 | } | ||
1075 | |||
1076 | /* | ||
1077 | * Cross migrate two tasks | ||
1078 | */ | ||
1079 | int migrate_swap(struct task_struct *cur, struct task_struct *p) | ||
1080 | { | ||
1081 | struct migration_swap_arg arg; | ||
1082 | int ret = -EINVAL; | ||
1083 | |||
1084 | get_online_cpus(); | ||
1085 | |||
1086 | arg = (struct migration_swap_arg){ | ||
1087 | .src_task = cur, | ||
1088 | .src_cpu = task_cpu(cur), | ||
1089 | .dst_task = p, | ||
1090 | .dst_cpu = task_cpu(p), | ||
1091 | }; | ||
1092 | |||
1093 | if (arg.src_cpu == arg.dst_cpu) | ||
1094 | goto out; | ||
1095 | |||
1096 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) | ||
1097 | goto out; | ||
1098 | |||
1099 | if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) | ||
1100 | goto out; | ||
1101 | |||
1102 | if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) | ||
1103 | goto out; | ||
1104 | |||
1105 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); | ||
1106 | |||
1107 | out: | ||
1108 | put_online_cpus(); | ||
1109 | return ret; | ||
1110 | } | ||
1111 | |||
1020 | struct migration_arg { | 1112 | struct migration_arg { |
1021 | struct task_struct *task; | 1113 | struct task_struct *task; |
1022 | int dest_cpu; | 1114 | int dest_cpu; |
@@ -1236,9 +1328,9 @@ out: | |||
1236 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. | 1328 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
1237 | */ | 1329 | */ |
1238 | static inline | 1330 | static inline |
1239 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | 1331 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
1240 | { | 1332 | { |
1241 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); | 1333 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); |
1242 | 1334 | ||
1243 | /* | 1335 | /* |
1244 | * In order not to call set_task_cpu() on a blocking task we need | 1336 | * In order not to call set_task_cpu() on a blocking task we need |
@@ -1330,12 +1422,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | |||
1330 | 1422 | ||
1331 | if (rq->idle_stamp) { | 1423 | if (rq->idle_stamp) { |
1332 | u64 delta = rq_clock(rq) - rq->idle_stamp; | 1424 | u64 delta = rq_clock(rq) - rq->idle_stamp; |
1333 | u64 max = 2*sysctl_sched_migration_cost; | 1425 | u64 max = 2*rq->max_idle_balance_cost; |
1334 | 1426 | ||
1335 | if (delta > max) | 1427 | update_avg(&rq->avg_idle, delta); |
1428 | |||
1429 | if (rq->avg_idle > max) | ||
1336 | rq->avg_idle = max; | 1430 | rq->avg_idle = max; |
1337 | else | 1431 | |
1338 | update_avg(&rq->avg_idle, delta); | ||
1339 | rq->idle_stamp = 0; | 1432 | rq->idle_stamp = 0; |
1340 | } | 1433 | } |
1341 | #endif | 1434 | #endif |
@@ -1396,6 +1489,14 @@ static void sched_ttwu_pending(void) | |||
1396 | 1489 | ||
1397 | void scheduler_ipi(void) | 1490 | void scheduler_ipi(void) |
1398 | { | 1491 | { |
1492 | /* | ||
1493 | * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting | ||
1494 | * TIF_NEED_RESCHED remotely (for the first time) will also send | ||
1495 | * this IPI. | ||
1496 | */ | ||
1497 | if (tif_need_resched()) | ||
1498 | set_preempt_need_resched(); | ||
1499 | |||
1399 | if (llist_empty(&this_rq()->wake_list) | 1500 | if (llist_empty(&this_rq()->wake_list) |
1400 | && !tick_nohz_full_cpu(smp_processor_id()) | 1501 | && !tick_nohz_full_cpu(smp_processor_id()) |
1401 | && !got_nohz_idle_kick()) | 1502 | && !got_nohz_idle_kick()) |
@@ -1513,7 +1614,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
1513 | if (p->sched_class->task_waking) | 1614 | if (p->sched_class->task_waking) |
1514 | p->sched_class->task_waking(p); | 1615 | p->sched_class->task_waking(p); |
1515 | 1616 | ||
1516 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | 1617 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); |
1517 | if (task_cpu(p) != cpu) { | 1618 | if (task_cpu(p) != cpu) { |
1518 | wake_flags |= WF_MIGRATED; | 1619 | wake_flags |= WF_MIGRATED; |
1519 | set_task_cpu(p, cpu); | 1620 | set_task_cpu(p, cpu); |
@@ -1595,7 +1696,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
1595 | * | 1696 | * |
1596 | * __sched_fork() is basic setup used by init_idle() too: | 1697 | * __sched_fork() is basic setup used by init_idle() too: |
1597 | */ | 1698 | */ |
1598 | static void __sched_fork(struct task_struct *p) | 1699 | static void __sched_fork(unsigned long clone_flags, struct task_struct *p) |
1599 | { | 1700 | { |
1600 | p->on_rq = 0; | 1701 | p->on_rq = 0; |
1601 | 1702 | ||
@@ -1619,16 +1720,24 @@ static void __sched_fork(struct task_struct *p) | |||
1619 | 1720 | ||
1620 | #ifdef CONFIG_NUMA_BALANCING | 1721 | #ifdef CONFIG_NUMA_BALANCING |
1621 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { | 1722 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { |
1622 | p->mm->numa_next_scan = jiffies; | 1723 | p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); |
1623 | p->mm->numa_next_reset = jiffies; | ||
1624 | p->mm->numa_scan_seq = 0; | 1724 | p->mm->numa_scan_seq = 0; |
1625 | } | 1725 | } |
1626 | 1726 | ||
1727 | if (clone_flags & CLONE_VM) | ||
1728 | p->numa_preferred_nid = current->numa_preferred_nid; | ||
1729 | else | ||
1730 | p->numa_preferred_nid = -1; | ||
1731 | |||
1627 | p->node_stamp = 0ULL; | 1732 | p->node_stamp = 0ULL; |
1628 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | 1733 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; |
1629 | p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; | ||
1630 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | 1734 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; |
1631 | p->numa_work.next = &p->numa_work; | 1735 | p->numa_work.next = &p->numa_work; |
1736 | p->numa_faults = NULL; | ||
1737 | p->numa_faults_buffer = NULL; | ||
1738 | |||
1739 | INIT_LIST_HEAD(&p->numa_entry); | ||
1740 | p->numa_group = NULL; | ||
1632 | #endif /* CONFIG_NUMA_BALANCING */ | 1741 | #endif /* CONFIG_NUMA_BALANCING */ |
1633 | } | 1742 | } |
1634 | 1743 | ||
@@ -1654,12 +1763,12 @@ void set_numabalancing_state(bool enabled) | |||
1654 | /* | 1763 | /* |
1655 | * fork()/clone()-time setup: | 1764 | * fork()/clone()-time setup: |
1656 | */ | 1765 | */ |
1657 | void sched_fork(struct task_struct *p) | 1766 | void sched_fork(unsigned long clone_flags, struct task_struct *p) |
1658 | { | 1767 | { |
1659 | unsigned long flags; | 1768 | unsigned long flags; |
1660 | int cpu = get_cpu(); | 1769 | int cpu = get_cpu(); |
1661 | 1770 | ||
1662 | __sched_fork(p); | 1771 | __sched_fork(clone_flags, p); |
1663 | /* | 1772 | /* |
1664 | * We mark the process as running here. This guarantees that | 1773 | * We mark the process as running here. This guarantees that |
1665 | * nobody will actually run it, and a signal or other external | 1774 | * nobody will actually run it, and a signal or other external |
@@ -1717,10 +1826,7 @@ void sched_fork(struct task_struct *p) | |||
1717 | #if defined(CONFIG_SMP) | 1826 | #if defined(CONFIG_SMP) |
1718 | p->on_cpu = 0; | 1827 | p->on_cpu = 0; |
1719 | #endif | 1828 | #endif |
1720 | #ifdef CONFIG_PREEMPT_COUNT | 1829 | init_task_preempt_count(p); |
1721 | /* Want to start with kernel preemption disabled. */ | ||
1722 | task_thread_info(p)->preempt_count = 1; | ||
1723 | #endif | ||
1724 | #ifdef CONFIG_SMP | 1830 | #ifdef CONFIG_SMP |
1725 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 1831 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
1726 | #endif | 1832 | #endif |
@@ -1747,7 +1853,7 @@ void wake_up_new_task(struct task_struct *p) | |||
1747 | * - cpus_allowed can change in the fork path | 1853 | * - cpus_allowed can change in the fork path |
1748 | * - any previously selected cpu might disappear through hotplug | 1854 | * - any previously selected cpu might disappear through hotplug |
1749 | */ | 1855 | */ |
1750 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); | 1856 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
1751 | #endif | 1857 | #endif |
1752 | 1858 | ||
1753 | /* Initialize new task's runnable average */ | 1859 | /* Initialize new task's runnable average */ |
@@ -1838,7 +1944,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
1838 | struct task_struct *next) | 1944 | struct task_struct *next) |
1839 | { | 1945 | { |
1840 | trace_sched_switch(prev, next); | 1946 | trace_sched_switch(prev, next); |
1841 | sched_info_switch(prev, next); | 1947 | sched_info_switch(rq, prev, next); |
1842 | perf_event_task_sched_out(prev, next); | 1948 | perf_event_task_sched_out(prev, next); |
1843 | fire_sched_out_preempt_notifiers(prev, next); | 1949 | fire_sched_out_preempt_notifiers(prev, next); |
1844 | prepare_lock_switch(rq, next); | 1950 | prepare_lock_switch(rq, next); |
@@ -1890,6 +1996,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1890 | if (mm) | 1996 | if (mm) |
1891 | mmdrop(mm); | 1997 | mmdrop(mm); |
1892 | if (unlikely(prev_state == TASK_DEAD)) { | 1998 | if (unlikely(prev_state == TASK_DEAD)) { |
1999 | task_numa_free(prev); | ||
2000 | |||
1893 | /* | 2001 | /* |
1894 | * Remove function-return probe instances associated with this | 2002 | * Remove function-return probe instances associated with this |
1895 | * task and put them back on the free list. | 2003 | * task and put them back on the free list. |
@@ -2073,7 +2181,7 @@ void sched_exec(void) | |||
2073 | int dest_cpu; | 2181 | int dest_cpu; |
2074 | 2182 | ||
2075 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2183 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2076 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); | 2184 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); |
2077 | if (dest_cpu == smp_processor_id()) | 2185 | if (dest_cpu == smp_processor_id()) |
2078 | goto unlock; | 2186 | goto unlock; |
2079 | 2187 | ||
@@ -2215,7 +2323,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) | |||
2215 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 2323 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
2216 | defined(CONFIG_PREEMPT_TRACER)) | 2324 | defined(CONFIG_PREEMPT_TRACER)) |
2217 | 2325 | ||
2218 | void __kprobes add_preempt_count(int val) | 2326 | void __kprobes preempt_count_add(int val) |
2219 | { | 2327 | { |
2220 | #ifdef CONFIG_DEBUG_PREEMPT | 2328 | #ifdef CONFIG_DEBUG_PREEMPT |
2221 | /* | 2329 | /* |
@@ -2224,7 +2332,7 @@ void __kprobes add_preempt_count(int val) | |||
2224 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 2332 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
2225 | return; | 2333 | return; |
2226 | #endif | 2334 | #endif |
2227 | preempt_count() += val; | 2335 | __preempt_count_add(val); |
2228 | #ifdef CONFIG_DEBUG_PREEMPT | 2336 | #ifdef CONFIG_DEBUG_PREEMPT |
2229 | /* | 2337 | /* |
2230 | * Spinlock count overflowing soon? | 2338 | * Spinlock count overflowing soon? |
@@ -2235,9 +2343,9 @@ void __kprobes add_preempt_count(int val) | |||
2235 | if (preempt_count() == val) | 2343 | if (preempt_count() == val) |
2236 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 2344 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
2237 | } | 2345 | } |
2238 | EXPORT_SYMBOL(add_preempt_count); | 2346 | EXPORT_SYMBOL(preempt_count_add); |
2239 | 2347 | ||
2240 | void __kprobes sub_preempt_count(int val) | 2348 | void __kprobes preempt_count_sub(int val) |
2241 | { | 2349 | { |
2242 | #ifdef CONFIG_DEBUG_PREEMPT | 2350 | #ifdef CONFIG_DEBUG_PREEMPT |
2243 | /* | 2351 | /* |
@@ -2255,9 +2363,9 @@ void __kprobes sub_preempt_count(int val) | |||
2255 | 2363 | ||
2256 | if (preempt_count() == val) | 2364 | if (preempt_count() == val) |
2257 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 2365 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
2258 | preempt_count() -= val; | 2366 | __preempt_count_sub(val); |
2259 | } | 2367 | } |
2260 | EXPORT_SYMBOL(sub_preempt_count); | 2368 | EXPORT_SYMBOL(preempt_count_sub); |
2261 | 2369 | ||
2262 | #endif | 2370 | #endif |
2263 | 2371 | ||
@@ -2430,6 +2538,7 @@ need_resched: | |||
2430 | put_prev_task(rq, prev); | 2538 | put_prev_task(rq, prev); |
2431 | next = pick_next_task(rq); | 2539 | next = pick_next_task(rq); |
2432 | clear_tsk_need_resched(prev); | 2540 | clear_tsk_need_resched(prev); |
2541 | clear_preempt_need_resched(); | ||
2433 | rq->skip_clock_update = 0; | 2542 | rq->skip_clock_update = 0; |
2434 | 2543 | ||
2435 | if (likely(prev != next)) { | 2544 | if (likely(prev != next)) { |
@@ -2520,9 +2629,9 @@ asmlinkage void __sched notrace preempt_schedule(void) | |||
2520 | return; | 2629 | return; |
2521 | 2630 | ||
2522 | do { | 2631 | do { |
2523 | add_preempt_count_notrace(PREEMPT_ACTIVE); | 2632 | __preempt_count_add(PREEMPT_ACTIVE); |
2524 | __schedule(); | 2633 | __schedule(); |
2525 | sub_preempt_count_notrace(PREEMPT_ACTIVE); | 2634 | __preempt_count_sub(PREEMPT_ACTIVE); |
2526 | 2635 | ||
2527 | /* | 2636 | /* |
2528 | * Check again in case we missed a preemption opportunity | 2637 | * Check again in case we missed a preemption opportunity |
@@ -2541,20 +2650,19 @@ EXPORT_SYMBOL(preempt_schedule); | |||
2541 | */ | 2650 | */ |
2542 | asmlinkage void __sched preempt_schedule_irq(void) | 2651 | asmlinkage void __sched preempt_schedule_irq(void) |
2543 | { | 2652 | { |
2544 | struct thread_info *ti = current_thread_info(); | ||
2545 | enum ctx_state prev_state; | 2653 | enum ctx_state prev_state; |
2546 | 2654 | ||
2547 | /* Catch callers which need to be fixed */ | 2655 | /* Catch callers which need to be fixed */ |
2548 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 2656 | BUG_ON(preempt_count() || !irqs_disabled()); |
2549 | 2657 | ||
2550 | prev_state = exception_enter(); | 2658 | prev_state = exception_enter(); |
2551 | 2659 | ||
2552 | do { | 2660 | do { |
2553 | add_preempt_count(PREEMPT_ACTIVE); | 2661 | __preempt_count_add(PREEMPT_ACTIVE); |
2554 | local_irq_enable(); | 2662 | local_irq_enable(); |
2555 | __schedule(); | 2663 | __schedule(); |
2556 | local_irq_disable(); | 2664 | local_irq_disable(); |
2557 | sub_preempt_count(PREEMPT_ACTIVE); | 2665 | __preempt_count_sub(PREEMPT_ACTIVE); |
2558 | 2666 | ||
2559 | /* | 2667 | /* |
2560 | * Check again in case we missed a preemption opportunity | 2668 | * Check again in case we missed a preemption opportunity |
@@ -3794,16 +3902,11 @@ SYSCALL_DEFINE0(sched_yield) | |||
3794 | return 0; | 3902 | return 0; |
3795 | } | 3903 | } |
3796 | 3904 | ||
3797 | static inline int should_resched(void) | ||
3798 | { | ||
3799 | return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); | ||
3800 | } | ||
3801 | |||
3802 | static void __cond_resched(void) | 3905 | static void __cond_resched(void) |
3803 | { | 3906 | { |
3804 | add_preempt_count(PREEMPT_ACTIVE); | 3907 | __preempt_count_add(PREEMPT_ACTIVE); |
3805 | __schedule(); | 3908 | __schedule(); |
3806 | sub_preempt_count(PREEMPT_ACTIVE); | 3909 | __preempt_count_sub(PREEMPT_ACTIVE); |
3807 | } | 3910 | } |
3808 | 3911 | ||
3809 | int __sched _cond_resched(void) | 3912 | int __sched _cond_resched(void) |
@@ -4186,7 +4289,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4186 | 4289 | ||
4187 | raw_spin_lock_irqsave(&rq->lock, flags); | 4290 | raw_spin_lock_irqsave(&rq->lock, flags); |
4188 | 4291 | ||
4189 | __sched_fork(idle); | 4292 | __sched_fork(0, idle); |
4190 | idle->state = TASK_RUNNING; | 4293 | idle->state = TASK_RUNNING; |
4191 | idle->se.exec_start = sched_clock(); | 4294 | idle->se.exec_start = sched_clock(); |
4192 | 4295 | ||
@@ -4212,7 +4315,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4212 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 4315 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
4213 | 4316 | ||
4214 | /* Set the preempt count _outside_ the spinlocks! */ | 4317 | /* Set the preempt count _outside_ the spinlocks! */ |
4215 | task_thread_info(idle)->preempt_count = 0; | 4318 | init_idle_preempt_count(idle, cpu); |
4216 | 4319 | ||
4217 | /* | 4320 | /* |
4218 | * The idle tasks have their own, simple scheduling class: | 4321 | * The idle tasks have their own, simple scheduling class: |
@@ -4346,6 +4449,53 @@ fail: | |||
4346 | return ret; | 4449 | return ret; |
4347 | } | 4450 | } |
4348 | 4451 | ||
4452 | #ifdef CONFIG_NUMA_BALANCING | ||
4453 | /* Migrate current task p to target_cpu */ | ||
4454 | int migrate_task_to(struct task_struct *p, int target_cpu) | ||
4455 | { | ||
4456 | struct migration_arg arg = { p, target_cpu }; | ||
4457 | int curr_cpu = task_cpu(p); | ||
4458 | |||
4459 | if (curr_cpu == target_cpu) | ||
4460 | return 0; | ||
4461 | |||
4462 | if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) | ||
4463 | return -EINVAL; | ||
4464 | |||
4465 | /* TODO: This is not properly updating schedstats */ | ||
4466 | |||
4467 | return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); | ||
4468 | } | ||
4469 | |||
4470 | /* | ||
4471 | * Requeue a task on a given node and accurately track the number of NUMA | ||
4472 | * tasks on the runqueues | ||
4473 | */ | ||
4474 | void sched_setnuma(struct task_struct *p, int nid) | ||
4475 | { | ||
4476 | struct rq *rq; | ||
4477 | unsigned long flags; | ||
4478 | bool on_rq, running; | ||
4479 | |||
4480 | rq = task_rq_lock(p, &flags); | ||
4481 | on_rq = p->on_rq; | ||
4482 | running = task_current(rq, p); | ||
4483 | |||
4484 | if (on_rq) | ||
4485 | dequeue_task(rq, p, 0); | ||
4486 | if (running) | ||
4487 | p->sched_class->put_prev_task(rq, p); | ||
4488 | |||
4489 | p->numa_preferred_nid = nid; | ||
4490 | |||
4491 | if (running) | ||
4492 | p->sched_class->set_curr_task(rq); | ||
4493 | if (on_rq) | ||
4494 | enqueue_task(rq, p, 0); | ||
4495 | task_rq_unlock(rq, p, &flags); | ||
4496 | } | ||
4497 | #endif | ||
4498 | |||
4349 | /* | 4499 | /* |
4350 | * migration_cpu_stop - this will be executed by a highprio stopper thread | 4500 | * migration_cpu_stop - this will be executed by a highprio stopper thread |
4351 | * and performs thread migration by bumping thread off CPU then | 4501 | * and performs thread migration by bumping thread off CPU then |
@@ -5119,6 +5269,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
5119 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 5269 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
5120 | DEFINE_PER_CPU(int, sd_llc_size); | 5270 | DEFINE_PER_CPU(int, sd_llc_size); |
5121 | DEFINE_PER_CPU(int, sd_llc_id); | 5271 | DEFINE_PER_CPU(int, sd_llc_id); |
5272 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
5122 | 5273 | ||
5123 | static void update_top_cache_domain(int cpu) | 5274 | static void update_top_cache_domain(int cpu) |
5124 | { | 5275 | { |
@@ -5135,6 +5286,9 @@ static void update_top_cache_domain(int cpu) | |||
5135 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 5286 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
5136 | per_cpu(sd_llc_size, cpu) = size; | 5287 | per_cpu(sd_llc_size, cpu) = size; |
5137 | per_cpu(sd_llc_id, cpu) = id; | 5288 | per_cpu(sd_llc_id, cpu) = id; |
5289 | |||
5290 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
5291 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
5138 | } | 5292 | } |
5139 | 5293 | ||
5140 | /* | 5294 | /* |
@@ -5654,6 +5808,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | |||
5654 | | 0*SD_SHARE_PKG_RESOURCES | 5808 | | 0*SD_SHARE_PKG_RESOURCES |
5655 | | 1*SD_SERIALIZE | 5809 | | 1*SD_SERIALIZE |
5656 | | 0*SD_PREFER_SIBLING | 5810 | | 0*SD_PREFER_SIBLING |
5811 | | 1*SD_NUMA | ||
5657 | | sd_local_flags(level) | 5812 | | sd_local_flags(level) |
5658 | , | 5813 | , |
5659 | .last_balance = jiffies, | 5814 | .last_balance = jiffies, |
@@ -6505,6 +6660,7 @@ void __init sched_init(void) | |||
6505 | rq->online = 0; | 6660 | rq->online = 0; |
6506 | rq->idle_stamp = 0; | 6661 | rq->idle_stamp = 0; |
6507 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 6662 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
6663 | rq->max_idle_balance_cost = sysctl_sched_migration_cost; | ||
6508 | 6664 | ||
6509 | INIT_LIST_HEAD(&rq->cfs_tasks); | 6665 | INIT_LIST_HEAD(&rq->cfs_tasks); |
6510 | 6666 | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 196559994f7c..e6ba5e31c7ca 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/seq_file.h> | 15 | #include <linux/seq_file.h> |
16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
18 | #include <linux/mempolicy.h> | ||
18 | 19 | ||
19 | #include "sched.h" | 20 | #include "sched.h" |
20 | 21 | ||
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
137 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", | 138 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
138 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
139 | #endif | 140 | #endif |
141 | #ifdef CONFIG_NUMA_BALANCING | ||
142 | SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); | ||
143 | #endif | ||
140 | #ifdef CONFIG_CGROUP_SCHED | 144 | #ifdef CONFIG_CGROUP_SCHED |
141 | SEQ_printf(m, " %s", task_group_path(task_group(p))); | 145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
142 | #endif | 146 | #endif |
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
159 | read_lock_irqsave(&tasklist_lock, flags); | 163 | read_lock_irqsave(&tasklist_lock, flags); |
160 | 164 | ||
161 | do_each_thread(g, p) { | 165 | do_each_thread(g, p) { |
162 | if (!p->on_rq || task_cpu(p) != rq_cpu) | 166 | if (task_cpu(p) != rq_cpu) |
163 | continue; | 167 | continue; |
164 | 168 | ||
165 | print_task(m, rq, p); | 169 | print_task(m, rq, p); |
@@ -345,7 +349,7 @@ static void sched_debug_header(struct seq_file *m) | |||
345 | cpu_clk = local_clock(); | 349 | cpu_clk = local_clock(); |
346 | local_irq_restore(flags); | 350 | local_irq_restore(flags); |
347 | 351 | ||
348 | SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", | 352 | SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n", |
349 | init_utsname()->release, | 353 | init_utsname()->release, |
350 | (int)strcspn(init_utsname()->version, " "), | 354 | (int)strcspn(init_utsname()->version, " "), |
351 | init_utsname()->version); | 355 | init_utsname()->version); |
@@ -488,6 +492,56 @@ static int __init init_sched_debug_procfs(void) | |||
488 | 492 | ||
489 | __initcall(init_sched_debug_procfs); | 493 | __initcall(init_sched_debug_procfs); |
490 | 494 | ||
495 | #define __P(F) \ | ||
496 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | ||
497 | #define P(F) \ | ||
498 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | ||
499 | #define __PN(F) \ | ||
500 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
501 | #define PN(F) \ | ||
502 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
503 | |||
504 | |||
505 | static void sched_show_numa(struct task_struct *p, struct seq_file *m) | ||
506 | { | ||
507 | #ifdef CONFIG_NUMA_BALANCING | ||
508 | struct mempolicy *pol; | ||
509 | int node, i; | ||
510 | |||
511 | if (p->mm) | ||
512 | P(mm->numa_scan_seq); | ||
513 | |||
514 | task_lock(p); | ||
515 | pol = p->mempolicy; | ||
516 | if (pol && !(pol->flags & MPOL_F_MORON)) | ||
517 | pol = NULL; | ||
518 | mpol_get(pol); | ||
519 | task_unlock(p); | ||
520 | |||
521 | SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); | ||
522 | |||
523 | for_each_online_node(node) { | ||
524 | for (i = 0; i < 2; i++) { | ||
525 | unsigned long nr_faults = -1; | ||
526 | int cpu_current, home_node; | ||
527 | |||
528 | if (p->numa_faults) | ||
529 | nr_faults = p->numa_faults[2*node + i]; | ||
530 | |||
531 | cpu_current = !i ? (task_node(p) == node) : | ||
532 | (pol && node_isset(node, pol->v.nodes)); | ||
533 | |||
534 | home_node = (p->numa_preferred_nid == node); | ||
535 | |||
536 | SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", | ||
537 | i, node, cpu_current, home_node, nr_faults); | ||
538 | } | ||
539 | } | ||
540 | |||
541 | mpol_put(pol); | ||
542 | #endif | ||
543 | } | ||
544 | |||
491 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | 545 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) |
492 | { | 546 | { |
493 | unsigned long nr_switches; | 547 | unsigned long nr_switches; |
@@ -591,6 +645,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
591 | SEQ_printf(m, "%-45s:%21Ld\n", | 645 | SEQ_printf(m, "%-45s:%21Ld\n", |
592 | "clock-delta", (long long)(t1-t0)); | 646 | "clock-delta", (long long)(t1-t0)); |
593 | } | 647 | } |
648 | |||
649 | sched_show_numa(p, m); | ||
594 | } | 650 | } |
595 | 651 | ||
596 | void proc_sched_set_task(struct task_struct *p) | 652 | void proc_sched_set_task(struct task_struct *p) |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c70201fbc61..803e343d7c89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
681 | } | 681 | } |
682 | 682 | ||
683 | #ifdef CONFIG_SMP | 683 | #ifdef CONFIG_SMP |
684 | static unsigned long task_h_load(struct task_struct *p); | ||
685 | |||
684 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 686 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
685 | 687 | ||
686 | /* Give new task start runnable values to heavy its load in infant time */ | 688 | /* Give new task start runnable values to heavy its load in infant time */ |
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
818 | 820 | ||
819 | #ifdef CONFIG_NUMA_BALANCING | 821 | #ifdef CONFIG_NUMA_BALANCING |
820 | /* | 822 | /* |
821 | * numa task sample period in ms | 823 | * Approximate time to scan a full NUMA task in ms. The task scan period is |
824 | * calculated based on the task's virtual memory size and | ||
825 | * numa_balancing_scan_size. | ||
822 | */ | 826 | */ |
823 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | 827 | unsigned int sysctl_numa_balancing_scan_period_min = 1000; |
824 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | 828 | unsigned int sysctl_numa_balancing_scan_period_max = 60000; |
825 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
826 | 829 | ||
827 | /* Portion of address space to scan in MB */ | 830 | /* Portion of address space to scan in MB */ |
828 | unsigned int sysctl_numa_balancing_scan_size = 256; | 831 | unsigned int sysctl_numa_balancing_scan_size = 256; |
@@ -830,41 +833,819 @@ unsigned int sysctl_numa_balancing_scan_size = 256; | |||
830 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | 833 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ |
831 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | 834 | unsigned int sysctl_numa_balancing_scan_delay = 1000; |
832 | 835 | ||
833 | static void task_numa_placement(struct task_struct *p) | 836 | /* |
837 | * After skipping a page migration on a shared page, skip N more numa page | ||
838 | * migrations unconditionally. This reduces the number of NUMA migrations | ||
839 | * in shared memory workloads, and has the effect of pulling tasks towards | ||
840 | * where their memory lives, over pulling the memory towards the task. | ||
841 | */ | ||
842 | unsigned int sysctl_numa_balancing_migrate_deferred = 16; | ||
843 | |||
844 | static unsigned int task_nr_scan_windows(struct task_struct *p) | ||
845 | { | ||
846 | unsigned long rss = 0; | ||
847 | unsigned long nr_scan_pages; | ||
848 | |||
849 | /* | ||
850 | * Calculations based on RSS as non-present and empty pages are skipped | ||
851 | * by the PTE scanner and NUMA hinting faults should be trapped based | ||
852 | * on resident pages | ||
853 | */ | ||
854 | nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); | ||
855 | rss = get_mm_rss(p->mm); | ||
856 | if (!rss) | ||
857 | rss = nr_scan_pages; | ||
858 | |||
859 | rss = round_up(rss, nr_scan_pages); | ||
860 | return rss / nr_scan_pages; | ||
861 | } | ||
862 | |||
863 | /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ | ||
864 | #define MAX_SCAN_WINDOW 2560 | ||
865 | |||
866 | static unsigned int task_scan_min(struct task_struct *p) | ||
867 | { | ||
868 | unsigned int scan, floor; | ||
869 | unsigned int windows = 1; | ||
870 | |||
871 | if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) | ||
872 | windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; | ||
873 | floor = 1000 / windows; | ||
874 | |||
875 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); | ||
876 | return max_t(unsigned int, floor, scan); | ||
877 | } | ||
878 | |||
879 | static unsigned int task_scan_max(struct task_struct *p) | ||
880 | { | ||
881 | unsigned int smin = task_scan_min(p); | ||
882 | unsigned int smax; | ||
883 | |||
884 | /* Watch for min being lower than max due to floor calculations */ | ||
885 | smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); | ||
886 | return max(smin, smax); | ||
887 | } | ||
888 | |||
889 | /* | ||
890 | * Once a preferred node is selected the scheduler balancer will prefer moving | ||
891 | * a task to that node for sysctl_numa_balancing_settle_count number of PTE | ||
892 | * scans. This will give the process the chance to accumulate more faults on | ||
893 | * the preferred node but still allow the scheduler to move the task again if | ||
894 | * the node's CPUs are overloaded. | ||
895 | */ | ||
896 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; | ||
897 | |||
898 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
899 | { | ||
900 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | ||
901 | rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); | ||
902 | } | ||
903 | |||
904 | static void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
905 | { | ||
906 | rq->nr_numa_running -= (p->numa_preferred_nid != -1); | ||
907 | rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); | ||
908 | } | ||
909 | |||
910 | struct numa_group { | ||
911 | atomic_t refcount; | ||
912 | |||
913 | spinlock_t lock; /* nr_tasks, tasks */ | ||
914 | int nr_tasks; | ||
915 | pid_t gid; | ||
916 | struct list_head task_list; | ||
917 | |||
918 | struct rcu_head rcu; | ||
919 | unsigned long total_faults; | ||
920 | unsigned long faults[0]; | ||
921 | }; | ||
922 | |||
923 | pid_t task_numa_group_id(struct task_struct *p) | ||
924 | { | ||
925 | return p->numa_group ? p->numa_group->gid : 0; | ||
926 | } | ||
927 | |||
928 | static inline int task_faults_idx(int nid, int priv) | ||
929 | { | ||
930 | return 2 * nid + priv; | ||
931 | } | ||
932 | |||
933 | static inline unsigned long task_faults(struct task_struct *p, int nid) | ||
934 | { | ||
935 | if (!p->numa_faults) | ||
936 | return 0; | ||
937 | |||
938 | return p->numa_faults[task_faults_idx(nid, 0)] + | ||
939 | p->numa_faults[task_faults_idx(nid, 1)]; | ||
940 | } | ||
941 | |||
942 | static inline unsigned long group_faults(struct task_struct *p, int nid) | ||
943 | { | ||
944 | if (!p->numa_group) | ||
945 | return 0; | ||
946 | |||
947 | return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; | ||
948 | } | ||
949 | |||
950 | /* | ||
951 | * These return the fraction of accesses done by a particular task, or | ||
952 | * task group, on a particular numa node. The group weight is given a | ||
953 | * larger multiplier, in order to group tasks together that are almost | ||
954 | * evenly spread out between numa nodes. | ||
955 | */ | ||
956 | static inline unsigned long task_weight(struct task_struct *p, int nid) | ||
957 | { | ||
958 | unsigned long total_faults; | ||
959 | |||
960 | if (!p->numa_faults) | ||
961 | return 0; | ||
962 | |||
963 | total_faults = p->total_numa_faults; | ||
964 | |||
965 | if (!total_faults) | ||
966 | return 0; | ||
967 | |||
968 | return 1000 * task_faults(p, nid) / total_faults; | ||
969 | } | ||
970 | |||
971 | static inline unsigned long group_weight(struct task_struct *p, int nid) | ||
972 | { | ||
973 | if (!p->numa_group || !p->numa_group->total_faults) | ||
974 | return 0; | ||
975 | |||
976 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; | ||
977 | } | ||
978 | |||
979 | static unsigned long weighted_cpuload(const int cpu); | ||
980 | static unsigned long source_load(int cpu, int type); | ||
981 | static unsigned long target_load(int cpu, int type); | ||
982 | static unsigned long power_of(int cpu); | ||
983 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg); | ||
984 | |||
985 | /* Cached statistics for all CPUs within a node */ | ||
986 | struct numa_stats { | ||
987 | unsigned long nr_running; | ||
988 | unsigned long load; | ||
989 | |||
990 | /* Total compute capacity of CPUs on a node */ | ||
991 | unsigned long power; | ||
992 | |||
993 | /* Approximate capacity in terms of runnable tasks on a node */ | ||
994 | unsigned long capacity; | ||
995 | int has_capacity; | ||
996 | }; | ||
997 | |||
998 | /* | ||
999 | * XXX borrowed from update_sg_lb_stats | ||
1000 | */ | ||
1001 | static void update_numa_stats(struct numa_stats *ns, int nid) | ||
1002 | { | ||
1003 | int cpu; | ||
1004 | |||
1005 | memset(ns, 0, sizeof(*ns)); | ||
1006 | for_each_cpu(cpu, cpumask_of_node(nid)) { | ||
1007 | struct rq *rq = cpu_rq(cpu); | ||
1008 | |||
1009 | ns->nr_running += rq->nr_running; | ||
1010 | ns->load += weighted_cpuload(cpu); | ||
1011 | ns->power += power_of(cpu); | ||
1012 | } | ||
1013 | |||
1014 | ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; | ||
1015 | ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); | ||
1016 | ns->has_capacity = (ns->nr_running < ns->capacity); | ||
1017 | } | ||
1018 | |||
1019 | struct task_numa_env { | ||
1020 | struct task_struct *p; | ||
1021 | |||
1022 | int src_cpu, src_nid; | ||
1023 | int dst_cpu, dst_nid; | ||
1024 | |||
1025 | struct numa_stats src_stats, dst_stats; | ||
1026 | |||
1027 | int imbalance_pct, idx; | ||
1028 | |||
1029 | struct task_struct *best_task; | ||
1030 | long best_imp; | ||
1031 | int best_cpu; | ||
1032 | }; | ||
1033 | |||
1034 | static void task_numa_assign(struct task_numa_env *env, | ||
1035 | struct task_struct *p, long imp) | ||
834 | { | 1036 | { |
835 | int seq; | 1037 | if (env->best_task) |
1038 | put_task_struct(env->best_task); | ||
1039 | if (p) | ||
1040 | get_task_struct(p); | ||
1041 | |||
1042 | env->best_task = p; | ||
1043 | env->best_imp = imp; | ||
1044 | env->best_cpu = env->dst_cpu; | ||
1045 | } | ||
1046 | |||
1047 | /* | ||
1048 | * This checks if the overall compute and NUMA accesses of the system would | ||
1049 | * be improved if the source task was migrated to the target dst_cpu taking | ||
1050 | * into account that it might be best if task running on the dst_cpu should | ||
1051 | * be exchanged with the source task | ||
1052 | */ | ||
1053 | static void task_numa_compare(struct task_numa_env *env, | ||
1054 | long taskimp, long groupimp) | ||
1055 | { | ||
1056 | struct rq *src_rq = cpu_rq(env->src_cpu); | ||
1057 | struct rq *dst_rq = cpu_rq(env->dst_cpu); | ||
1058 | struct task_struct *cur; | ||
1059 | long dst_load, src_load; | ||
1060 | long load; | ||
1061 | long imp = (groupimp > 0) ? groupimp : taskimp; | ||
1062 | |||
1063 | rcu_read_lock(); | ||
1064 | cur = ACCESS_ONCE(dst_rq->curr); | ||
1065 | if (cur->pid == 0) /* idle */ | ||
1066 | cur = NULL; | ||
1067 | |||
1068 | /* | ||
1069 | * "imp" is the fault differential for the source task between the | ||
1070 | * source and destination node. Calculate the total differential for | ||
1071 | * the source task and potential destination task. The more negative | ||
1072 | * the value is, the more remote accesses that would be expected to | ||
1073 | * be incurred if the tasks were swapped. | ||
1074 | */ | ||
1075 | if (cur) { | ||
1076 | /* Skip this swap candidate if cannot move to the source cpu */ | ||
1077 | if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) | ||
1078 | goto unlock; | ||
1079 | |||
1080 | /* | ||
1081 | * If dst and source tasks are in the same NUMA group, or not | ||
1082 | * in any group then look only at task weights. | ||
1083 | */ | ||
1084 | if (cur->numa_group == env->p->numa_group) { | ||
1085 | imp = taskimp + task_weight(cur, env->src_nid) - | ||
1086 | task_weight(cur, env->dst_nid); | ||
1087 | /* | ||
1088 | * Add some hysteresis to prevent swapping the | ||
1089 | * tasks within a group over tiny differences. | ||
1090 | */ | ||
1091 | if (cur->numa_group) | ||
1092 | imp -= imp/16; | ||
1093 | } else { | ||
1094 | /* | ||
1095 | * Compare the group weights. If a task is all by | ||
1096 | * itself (not part of a group), use the task weight | ||
1097 | * instead. | ||
1098 | */ | ||
1099 | if (env->p->numa_group) | ||
1100 | imp = groupimp; | ||
1101 | else | ||
1102 | imp = taskimp; | ||
1103 | |||
1104 | if (cur->numa_group) | ||
1105 | imp += group_weight(cur, env->src_nid) - | ||
1106 | group_weight(cur, env->dst_nid); | ||
1107 | else | ||
1108 | imp += task_weight(cur, env->src_nid) - | ||
1109 | task_weight(cur, env->dst_nid); | ||
1110 | } | ||
1111 | } | ||
1112 | |||
1113 | if (imp < env->best_imp) | ||
1114 | goto unlock; | ||
1115 | |||
1116 | if (!cur) { | ||
1117 | /* Is there capacity at our destination? */ | ||
1118 | if (env->src_stats.has_capacity && | ||
1119 | !env->dst_stats.has_capacity) | ||
1120 | goto unlock; | ||
1121 | |||
1122 | goto balance; | ||
1123 | } | ||
1124 | |||
1125 | /* Balance doesn't matter much if we're running a task per cpu */ | ||
1126 | if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) | ||
1127 | goto assign; | ||
836 | 1128 | ||
837 | if (!p->mm) /* for example, ksmd faulting in a user's mm */ | 1129 | /* |
1130 | * In the overloaded case, try and keep the load balanced. | ||
1131 | */ | ||
1132 | balance: | ||
1133 | dst_load = env->dst_stats.load; | ||
1134 | src_load = env->src_stats.load; | ||
1135 | |||
1136 | /* XXX missing power terms */ | ||
1137 | load = task_h_load(env->p); | ||
1138 | dst_load += load; | ||
1139 | src_load -= load; | ||
1140 | |||
1141 | if (cur) { | ||
1142 | load = task_h_load(cur); | ||
1143 | dst_load -= load; | ||
1144 | src_load += load; | ||
1145 | } | ||
1146 | |||
1147 | /* make src_load the smaller */ | ||
1148 | if (dst_load < src_load) | ||
1149 | swap(dst_load, src_load); | ||
1150 | |||
1151 | if (src_load * env->imbalance_pct < dst_load * 100) | ||
1152 | goto unlock; | ||
1153 | |||
1154 | assign: | ||
1155 | task_numa_assign(env, cur, imp); | ||
1156 | unlock: | ||
1157 | rcu_read_unlock(); | ||
1158 | } | ||
1159 | |||
1160 | static void task_numa_find_cpu(struct task_numa_env *env, | ||
1161 | long taskimp, long groupimp) | ||
1162 | { | ||
1163 | int cpu; | ||
1164 | |||
1165 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { | ||
1166 | /* Skip this CPU if the source task cannot migrate */ | ||
1167 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) | ||
1168 | continue; | ||
1169 | |||
1170 | env->dst_cpu = cpu; | ||
1171 | task_numa_compare(env, taskimp, groupimp); | ||
1172 | } | ||
1173 | } | ||
1174 | |||
1175 | static int task_numa_migrate(struct task_struct *p) | ||
1176 | { | ||
1177 | struct task_numa_env env = { | ||
1178 | .p = p, | ||
1179 | |||
1180 | .src_cpu = task_cpu(p), | ||
1181 | .src_nid = task_node(p), | ||
1182 | |||
1183 | .imbalance_pct = 112, | ||
1184 | |||
1185 | .best_task = NULL, | ||
1186 | .best_imp = 0, | ||
1187 | .best_cpu = -1 | ||
1188 | }; | ||
1189 | struct sched_domain *sd; | ||
1190 | unsigned long taskweight, groupweight; | ||
1191 | int nid, ret; | ||
1192 | long taskimp, groupimp; | ||
1193 | |||
1194 | /* | ||
1195 | * Pick the lowest SD_NUMA domain, as that would have the smallest | ||
1196 | * imbalance and would be the first to start moving tasks about. | ||
1197 | * | ||
1198 | * And we want to avoid any moving of tasks about, as that would create | ||
1199 | * random movement of tasks -- counter the numa conditions we're trying | ||
1200 | * to satisfy here. | ||
1201 | */ | ||
1202 | rcu_read_lock(); | ||
1203 | sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); | ||
1204 | env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; | ||
1205 | rcu_read_unlock(); | ||
1206 | |||
1207 | taskweight = task_weight(p, env.src_nid); | ||
1208 | groupweight = group_weight(p, env.src_nid); | ||
1209 | update_numa_stats(&env.src_stats, env.src_nid); | ||
1210 | env.dst_nid = p->numa_preferred_nid; | ||
1211 | taskimp = task_weight(p, env.dst_nid) - taskweight; | ||
1212 | groupimp = group_weight(p, env.dst_nid) - groupweight; | ||
1213 | update_numa_stats(&env.dst_stats, env.dst_nid); | ||
1214 | |||
1215 | /* If the preferred nid has capacity, try to use it. */ | ||
1216 | if (env.dst_stats.has_capacity) | ||
1217 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
1218 | |||
1219 | /* No space available on the preferred nid. Look elsewhere. */ | ||
1220 | if (env.best_cpu == -1) { | ||
1221 | for_each_online_node(nid) { | ||
1222 | if (nid == env.src_nid || nid == p->numa_preferred_nid) | ||
1223 | continue; | ||
1224 | |||
1225 | /* Only consider nodes where both task and groups benefit */ | ||
1226 | taskimp = task_weight(p, nid) - taskweight; | ||
1227 | groupimp = group_weight(p, nid) - groupweight; | ||
1228 | if (taskimp < 0 && groupimp < 0) | ||
1229 | continue; | ||
1230 | |||
1231 | env.dst_nid = nid; | ||
1232 | update_numa_stats(&env.dst_stats, env.dst_nid); | ||
1233 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
1234 | } | ||
1235 | } | ||
1236 | |||
1237 | /* No better CPU than the current one was found. */ | ||
1238 | if (env.best_cpu == -1) | ||
1239 | return -EAGAIN; | ||
1240 | |||
1241 | sched_setnuma(p, env.dst_nid); | ||
1242 | |||
1243 | /* | ||
1244 | * Reset the scan period if the task is being rescheduled on an | ||
1245 | * alternative node to recheck if the tasks is now properly placed. | ||
1246 | */ | ||
1247 | p->numa_scan_period = task_scan_min(p); | ||
1248 | |||
1249 | if (env.best_task == NULL) { | ||
1250 | int ret = migrate_task_to(p, env.best_cpu); | ||
1251 | return ret; | ||
1252 | } | ||
1253 | |||
1254 | ret = migrate_swap(p, env.best_task); | ||
1255 | put_task_struct(env.best_task); | ||
1256 | return ret; | ||
1257 | } | ||
1258 | |||
1259 | /* Attempt to migrate a task to a CPU on the preferred node. */ | ||
1260 | static void numa_migrate_preferred(struct task_struct *p) | ||
1261 | { | ||
1262 | /* This task has no NUMA fault statistics yet */ | ||
1263 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | ||
1264 | return; | ||
1265 | |||
1266 | /* Periodically retry migrating the task to the preferred node */ | ||
1267 | p->numa_migrate_retry = jiffies + HZ; | ||
1268 | |||
1269 | /* Success if task is already running on preferred CPU */ | ||
1270 | if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) | ||
838 | return; | 1271 | return; |
1272 | |||
1273 | /* Otherwise, try migrate to a CPU on the preferred node */ | ||
1274 | task_numa_migrate(p); | ||
1275 | } | ||
1276 | |||
1277 | /* | ||
1278 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS | ||
1279 | * increments. The more local the fault statistics are, the higher the scan | ||
1280 | * period will be for the next scan window. If local/remote ratio is below | ||
1281 | * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the | ||
1282 | * scan period will decrease | ||
1283 | */ | ||
1284 | #define NUMA_PERIOD_SLOTS 10 | ||
1285 | #define NUMA_PERIOD_THRESHOLD 3 | ||
1286 | |||
1287 | /* | ||
1288 | * Increase the scan period (slow down scanning) if the majority of | ||
1289 | * our memory is already on our local node, or if the majority of | ||
1290 | * the page accesses are shared with other processes. | ||
1291 | * Otherwise, decrease the scan period. | ||
1292 | */ | ||
1293 | static void update_task_scan_period(struct task_struct *p, | ||
1294 | unsigned long shared, unsigned long private) | ||
1295 | { | ||
1296 | unsigned int period_slot; | ||
1297 | int ratio; | ||
1298 | int diff; | ||
1299 | |||
1300 | unsigned long remote = p->numa_faults_locality[0]; | ||
1301 | unsigned long local = p->numa_faults_locality[1]; | ||
1302 | |||
1303 | /* | ||
1304 | * If there were no record hinting faults then either the task is | ||
1305 | * completely idle or all activity is areas that are not of interest | ||
1306 | * to automatic numa balancing. Scan slower | ||
1307 | */ | ||
1308 | if (local + shared == 0) { | ||
1309 | p->numa_scan_period = min(p->numa_scan_period_max, | ||
1310 | p->numa_scan_period << 1); | ||
1311 | |||
1312 | p->mm->numa_next_scan = jiffies + | ||
1313 | msecs_to_jiffies(p->numa_scan_period); | ||
1314 | |||
1315 | return; | ||
1316 | } | ||
1317 | |||
1318 | /* | ||
1319 | * Prepare to scale scan period relative to the current period. | ||
1320 | * == NUMA_PERIOD_THRESHOLD scan period stays the same | ||
1321 | * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) | ||
1322 | * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) | ||
1323 | */ | ||
1324 | period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); | ||
1325 | ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); | ||
1326 | if (ratio >= NUMA_PERIOD_THRESHOLD) { | ||
1327 | int slot = ratio - NUMA_PERIOD_THRESHOLD; | ||
1328 | if (!slot) | ||
1329 | slot = 1; | ||
1330 | diff = slot * period_slot; | ||
1331 | } else { | ||
1332 | diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; | ||
1333 | |||
1334 | /* | ||
1335 | * Scale scan rate increases based on sharing. There is an | ||
1336 | * inverse relationship between the degree of sharing and | ||
1337 | * the adjustment made to the scanning period. Broadly | ||
1338 | * speaking the intent is that there is little point | ||
1339 | * scanning faster if shared accesses dominate as it may | ||
1340 | * simply bounce migrations uselessly | ||
1341 | */ | ||
1342 | period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); | ||
1343 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | ||
1344 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | ||
1345 | } | ||
1346 | |||
1347 | p->numa_scan_period = clamp(p->numa_scan_period + diff, | ||
1348 | task_scan_min(p), task_scan_max(p)); | ||
1349 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | ||
1350 | } | ||
1351 | |||
1352 | static void task_numa_placement(struct task_struct *p) | ||
1353 | { | ||
1354 | int seq, nid, max_nid = -1, max_group_nid = -1; | ||
1355 | unsigned long max_faults = 0, max_group_faults = 0; | ||
1356 | unsigned long fault_types[2] = { 0, 0 }; | ||
1357 | spinlock_t *group_lock = NULL; | ||
1358 | |||
839 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); | 1359 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); |
840 | if (p->numa_scan_seq == seq) | 1360 | if (p->numa_scan_seq == seq) |
841 | return; | 1361 | return; |
842 | p->numa_scan_seq = seq; | 1362 | p->numa_scan_seq = seq; |
1363 | p->numa_scan_period_max = task_scan_max(p); | ||
1364 | |||
1365 | /* If the task is part of a group prevent parallel updates to group stats */ | ||
1366 | if (p->numa_group) { | ||
1367 | group_lock = &p->numa_group->lock; | ||
1368 | spin_lock(group_lock); | ||
1369 | } | ||
1370 | |||
1371 | /* Find the node with the highest number of faults */ | ||
1372 | for_each_online_node(nid) { | ||
1373 | unsigned long faults = 0, group_faults = 0; | ||
1374 | int priv, i; | ||
1375 | |||
1376 | for (priv = 0; priv < 2; priv++) { | ||
1377 | long diff; | ||
1378 | |||
1379 | i = task_faults_idx(nid, priv); | ||
1380 | diff = -p->numa_faults[i]; | ||
1381 | |||
1382 | /* Decay existing window, copy faults since last scan */ | ||
1383 | p->numa_faults[i] >>= 1; | ||
1384 | p->numa_faults[i] += p->numa_faults_buffer[i]; | ||
1385 | fault_types[priv] += p->numa_faults_buffer[i]; | ||
1386 | p->numa_faults_buffer[i] = 0; | ||
1387 | |||
1388 | faults += p->numa_faults[i]; | ||
1389 | diff += p->numa_faults[i]; | ||
1390 | p->total_numa_faults += diff; | ||
1391 | if (p->numa_group) { | ||
1392 | /* safe because we can only change our own group */ | ||
1393 | p->numa_group->faults[i] += diff; | ||
1394 | p->numa_group->total_faults += diff; | ||
1395 | group_faults += p->numa_group->faults[i]; | ||
1396 | } | ||
1397 | } | ||
1398 | |||
1399 | if (faults > max_faults) { | ||
1400 | max_faults = faults; | ||
1401 | max_nid = nid; | ||
1402 | } | ||
1403 | |||
1404 | if (group_faults > max_group_faults) { | ||
1405 | max_group_faults = group_faults; | ||
1406 | max_group_nid = nid; | ||
1407 | } | ||
1408 | } | ||
1409 | |||
1410 | update_task_scan_period(p, fault_types[0], fault_types[1]); | ||
1411 | |||
1412 | if (p->numa_group) { | ||
1413 | /* | ||
1414 | * If the preferred task and group nids are different, | ||
1415 | * iterate over the nodes again to find the best place. | ||
1416 | */ | ||
1417 | if (max_nid != max_group_nid) { | ||
1418 | unsigned long weight, max_weight = 0; | ||
1419 | |||
1420 | for_each_online_node(nid) { | ||
1421 | weight = task_weight(p, nid) + group_weight(p, nid); | ||
1422 | if (weight > max_weight) { | ||
1423 | max_weight = weight; | ||
1424 | max_nid = nid; | ||
1425 | } | ||
1426 | } | ||
1427 | } | ||
1428 | |||
1429 | spin_unlock(group_lock); | ||
1430 | } | ||
1431 | |||
1432 | /* Preferred node as the node with the most faults */ | ||
1433 | if (max_faults && max_nid != p->numa_preferred_nid) { | ||
1434 | /* Update the preferred nid and migrate task if possible */ | ||
1435 | sched_setnuma(p, max_nid); | ||
1436 | numa_migrate_preferred(p); | ||
1437 | } | ||
1438 | } | ||
1439 | |||
1440 | static inline int get_numa_group(struct numa_group *grp) | ||
1441 | { | ||
1442 | return atomic_inc_not_zero(&grp->refcount); | ||
1443 | } | ||
1444 | |||
1445 | static inline void put_numa_group(struct numa_group *grp) | ||
1446 | { | ||
1447 | if (atomic_dec_and_test(&grp->refcount)) | ||
1448 | kfree_rcu(grp, rcu); | ||
1449 | } | ||
1450 | |||
1451 | static void double_lock(spinlock_t *l1, spinlock_t *l2) | ||
1452 | { | ||
1453 | if (l1 > l2) | ||
1454 | swap(l1, l2); | ||
1455 | |||
1456 | spin_lock(l1); | ||
1457 | spin_lock_nested(l2, SINGLE_DEPTH_NESTING); | ||
1458 | } | ||
1459 | |||
1460 | static void task_numa_group(struct task_struct *p, int cpupid, int flags, | ||
1461 | int *priv) | ||
1462 | { | ||
1463 | struct numa_group *grp, *my_grp; | ||
1464 | struct task_struct *tsk; | ||
1465 | bool join = false; | ||
1466 | int cpu = cpupid_to_cpu(cpupid); | ||
1467 | int i; | ||
1468 | |||
1469 | if (unlikely(!p->numa_group)) { | ||
1470 | unsigned int size = sizeof(struct numa_group) + | ||
1471 | 2*nr_node_ids*sizeof(unsigned long); | ||
1472 | |||
1473 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); | ||
1474 | if (!grp) | ||
1475 | return; | ||
1476 | |||
1477 | atomic_set(&grp->refcount, 1); | ||
1478 | spin_lock_init(&grp->lock); | ||
1479 | INIT_LIST_HEAD(&grp->task_list); | ||
1480 | grp->gid = p->pid; | ||
1481 | |||
1482 | for (i = 0; i < 2*nr_node_ids; i++) | ||
1483 | grp->faults[i] = p->numa_faults[i]; | ||
1484 | |||
1485 | grp->total_faults = p->total_numa_faults; | ||
1486 | |||
1487 | list_add(&p->numa_entry, &grp->task_list); | ||
1488 | grp->nr_tasks++; | ||
1489 | rcu_assign_pointer(p->numa_group, grp); | ||
1490 | } | ||
1491 | |||
1492 | rcu_read_lock(); | ||
1493 | tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); | ||
1494 | |||
1495 | if (!cpupid_match_pid(tsk, cpupid)) | ||
1496 | goto no_join; | ||
1497 | |||
1498 | grp = rcu_dereference(tsk->numa_group); | ||
1499 | if (!grp) | ||
1500 | goto no_join; | ||
1501 | |||
1502 | my_grp = p->numa_group; | ||
1503 | if (grp == my_grp) | ||
1504 | goto no_join; | ||
1505 | |||
1506 | /* | ||
1507 | * Only join the other group if its bigger; if we're the bigger group, | ||
1508 | * the other task will join us. | ||
1509 | */ | ||
1510 | if (my_grp->nr_tasks > grp->nr_tasks) | ||
1511 | goto no_join; | ||
1512 | |||
1513 | /* | ||
1514 | * Tie-break on the grp address. | ||
1515 | */ | ||
1516 | if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) | ||
1517 | goto no_join; | ||
1518 | |||
1519 | /* Always join threads in the same process. */ | ||
1520 | if (tsk->mm == current->mm) | ||
1521 | join = true; | ||
1522 | |||
1523 | /* Simple filter to avoid false positives due to PID collisions */ | ||
1524 | if (flags & TNF_SHARED) | ||
1525 | join = true; | ||
1526 | |||
1527 | /* Update priv based on whether false sharing was detected */ | ||
1528 | *priv = !join; | ||
1529 | |||
1530 | if (join && !get_numa_group(grp)) | ||
1531 | goto no_join; | ||
1532 | |||
1533 | rcu_read_unlock(); | ||
843 | 1534 | ||
844 | /* FIXME: Scheduling placement policy hints go here */ | 1535 | if (!join) |
1536 | return; | ||
1537 | |||
1538 | double_lock(&my_grp->lock, &grp->lock); | ||
1539 | |||
1540 | for (i = 0; i < 2*nr_node_ids; i++) { | ||
1541 | my_grp->faults[i] -= p->numa_faults[i]; | ||
1542 | grp->faults[i] += p->numa_faults[i]; | ||
1543 | } | ||
1544 | my_grp->total_faults -= p->total_numa_faults; | ||
1545 | grp->total_faults += p->total_numa_faults; | ||
1546 | |||
1547 | list_move(&p->numa_entry, &grp->task_list); | ||
1548 | my_grp->nr_tasks--; | ||
1549 | grp->nr_tasks++; | ||
1550 | |||
1551 | spin_unlock(&my_grp->lock); | ||
1552 | spin_unlock(&grp->lock); | ||
1553 | |||
1554 | rcu_assign_pointer(p->numa_group, grp); | ||
1555 | |||
1556 | put_numa_group(my_grp); | ||
1557 | return; | ||
1558 | |||
1559 | no_join: | ||
1560 | rcu_read_unlock(); | ||
1561 | return; | ||
1562 | } | ||
1563 | |||
1564 | void task_numa_free(struct task_struct *p) | ||
1565 | { | ||
1566 | struct numa_group *grp = p->numa_group; | ||
1567 | int i; | ||
1568 | void *numa_faults = p->numa_faults; | ||
1569 | |||
1570 | if (grp) { | ||
1571 | spin_lock(&grp->lock); | ||
1572 | for (i = 0; i < 2*nr_node_ids; i++) | ||
1573 | grp->faults[i] -= p->numa_faults[i]; | ||
1574 | grp->total_faults -= p->total_numa_faults; | ||
1575 | |||
1576 | list_del(&p->numa_entry); | ||
1577 | grp->nr_tasks--; | ||
1578 | spin_unlock(&grp->lock); | ||
1579 | rcu_assign_pointer(p->numa_group, NULL); | ||
1580 | put_numa_group(grp); | ||
1581 | } | ||
1582 | |||
1583 | p->numa_faults = NULL; | ||
1584 | p->numa_faults_buffer = NULL; | ||
1585 | kfree(numa_faults); | ||
845 | } | 1586 | } |
846 | 1587 | ||
847 | /* | 1588 | /* |
848 | * Got a PROT_NONE fault for a page on @node. | 1589 | * Got a PROT_NONE fault for a page on @node. |
849 | */ | 1590 | */ |
850 | void task_numa_fault(int node, int pages, bool migrated) | 1591 | void task_numa_fault(int last_cpupid, int node, int pages, int flags) |
851 | { | 1592 | { |
852 | struct task_struct *p = current; | 1593 | struct task_struct *p = current; |
1594 | bool migrated = flags & TNF_MIGRATED; | ||
1595 | int priv; | ||
853 | 1596 | ||
854 | if (!numabalancing_enabled) | 1597 | if (!numabalancing_enabled) |
855 | return; | 1598 | return; |
856 | 1599 | ||
857 | /* FIXME: Allocate task-specific structure for placement policy here */ | 1600 | /* for example, ksmd faulting in a user's mm */ |
1601 | if (!p->mm) | ||
1602 | return; | ||
1603 | |||
1604 | /* Do not worry about placement if exiting */ | ||
1605 | if (p->state == TASK_DEAD) | ||
1606 | return; | ||
1607 | |||
1608 | /* Allocate buffer to track faults on a per-node basis */ | ||
1609 | if (unlikely(!p->numa_faults)) { | ||
1610 | int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; | ||
1611 | |||
1612 | /* numa_faults and numa_faults_buffer share the allocation */ | ||
1613 | p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); | ||
1614 | if (!p->numa_faults) | ||
1615 | return; | ||
1616 | |||
1617 | BUG_ON(p->numa_faults_buffer); | ||
1618 | p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); | ||
1619 | p->total_numa_faults = 0; | ||
1620 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | ||
1621 | } | ||
858 | 1622 | ||
859 | /* | 1623 | /* |
860 | * If pages are properly placed (did not migrate) then scan slower. | 1624 | * First accesses are treated as private, otherwise consider accesses |
861 | * This is reset periodically in case of phase changes | 1625 | * to be private if the accessing pid has not changed |
862 | */ | 1626 | */ |
863 | if (!migrated) | 1627 | if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { |
864 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | 1628 | priv = 1; |
865 | p->numa_scan_period + jiffies_to_msecs(10)); | 1629 | } else { |
1630 | priv = cpupid_match_pid(p, last_cpupid); | ||
1631 | if (!priv && !(flags & TNF_NO_GROUP)) | ||
1632 | task_numa_group(p, last_cpupid, flags, &priv); | ||
1633 | } | ||
866 | 1634 | ||
867 | task_numa_placement(p); | 1635 | task_numa_placement(p); |
1636 | |||
1637 | /* | ||
1638 | * Retry task to preferred node migration periodically, in case it | ||
1639 | * case it previously failed, or the scheduler moved us. | ||
1640 | */ | ||
1641 | if (time_after(jiffies, p->numa_migrate_retry)) | ||
1642 | numa_migrate_preferred(p); | ||
1643 | |||
1644 | if (migrated) | ||
1645 | p->numa_pages_migrated += pages; | ||
1646 | |||
1647 | p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; | ||
1648 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; | ||
868 | } | 1649 | } |
869 | 1650 | ||
870 | static void reset_ptenuma_scan(struct task_struct *p) | 1651 | static void reset_ptenuma_scan(struct task_struct *p) |
@@ -884,6 +1665,7 @@ void task_numa_work(struct callback_head *work) | |||
884 | struct mm_struct *mm = p->mm; | 1665 | struct mm_struct *mm = p->mm; |
885 | struct vm_area_struct *vma; | 1666 | struct vm_area_struct *vma; |
886 | unsigned long start, end; | 1667 | unsigned long start, end; |
1668 | unsigned long nr_pte_updates = 0; | ||
887 | long pages; | 1669 | long pages; |
888 | 1670 | ||
889 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 1671 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); |
@@ -900,35 +1682,9 @@ void task_numa_work(struct callback_head *work) | |||
900 | if (p->flags & PF_EXITING) | 1682 | if (p->flags & PF_EXITING) |
901 | return; | 1683 | return; |
902 | 1684 | ||
903 | /* | 1685 | if (!mm->numa_next_scan) { |
904 | * We do not care about task placement until a task runs on a node | 1686 | mm->numa_next_scan = now + |
905 | * other than the first one used by the address space. This is | 1687 | msecs_to_jiffies(sysctl_numa_balancing_scan_delay); |
906 | * largely because migrations are driven by what CPU the task | ||
907 | * is running on. If it's never scheduled on another node, it'll | ||
908 | * not migrate so why bother trapping the fault. | ||
909 | */ | ||
910 | if (mm->first_nid == NUMA_PTE_SCAN_INIT) | ||
911 | mm->first_nid = numa_node_id(); | ||
912 | if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { | ||
913 | /* Are we running on a new node yet? */ | ||
914 | if (numa_node_id() == mm->first_nid && | ||
915 | !sched_feat_numa(NUMA_FORCE)) | ||
916 | return; | ||
917 | |||
918 | mm->first_nid = NUMA_PTE_SCAN_ACTIVE; | ||
919 | } | ||
920 | |||
921 | /* | ||
922 | * Reset the scan period if enough time has gone by. Objective is that | ||
923 | * scanning will be reduced if pages are properly placed. As tasks | ||
924 | * can enter different phases this needs to be re-examined. Lacking | ||
925 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
926 | */ | ||
927 | migrate = mm->numa_next_reset; | ||
928 | if (time_after(now, migrate)) { | ||
929 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
930 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
931 | xchg(&mm->numa_next_reset, next_scan); | ||
932 | } | 1688 | } |
933 | 1689 | ||
934 | /* | 1690 | /* |
@@ -938,20 +1694,20 @@ void task_numa_work(struct callback_head *work) | |||
938 | if (time_before(now, migrate)) | 1694 | if (time_before(now, migrate)) |
939 | return; | 1695 | return; |
940 | 1696 | ||
941 | if (p->numa_scan_period == 0) | 1697 | if (p->numa_scan_period == 0) { |
942 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1698 | p->numa_scan_period_max = task_scan_max(p); |
1699 | p->numa_scan_period = task_scan_min(p); | ||
1700 | } | ||
943 | 1701 | ||
944 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | 1702 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); |
945 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | 1703 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) |
946 | return; | 1704 | return; |
947 | 1705 | ||
948 | /* | 1706 | /* |
949 | * Do not set pte_numa if the current running node is rate-limited. | 1707 | * Delay this task enough that another task of this mm will likely win |
950 | * This loses statistics on the fault but if we are unwilling to | 1708 | * the next time around. |
951 | * migrate to this node, it is less likely we can do useful work | ||
952 | */ | 1709 | */ |
953 | if (migrate_ratelimited(numa_node_id())) | 1710 | p->node_stamp += 2 * TICK_NSEC; |
954 | return; | ||
955 | 1711 | ||
956 | start = mm->numa_scan_offset; | 1712 | start = mm->numa_scan_offset; |
957 | pages = sysctl_numa_balancing_scan_size; | 1713 | pages = sysctl_numa_balancing_scan_size; |
@@ -967,18 +1723,32 @@ void task_numa_work(struct callback_head *work) | |||
967 | vma = mm->mmap; | 1723 | vma = mm->mmap; |
968 | } | 1724 | } |
969 | for (; vma; vma = vma->vm_next) { | 1725 | for (; vma; vma = vma->vm_next) { |
970 | if (!vma_migratable(vma)) | 1726 | if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) |
971 | continue; | 1727 | continue; |
972 | 1728 | ||
973 | /* Skip small VMAs. They are not likely to be of relevance */ | 1729 | /* |
974 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) | 1730 | * Shared library pages mapped by multiple processes are not |
1731 | * migrated as it is expected they are cache replicated. Avoid | ||
1732 | * hinting faults in read-only file-backed mappings or the vdso | ||
1733 | * as migrating the pages will be of marginal benefit. | ||
1734 | */ | ||
1735 | if (!vma->vm_mm || | ||
1736 | (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) | ||
975 | continue; | 1737 | continue; |
976 | 1738 | ||
977 | do { | 1739 | do { |
978 | start = max(start, vma->vm_start); | 1740 | start = max(start, vma->vm_start); |
979 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | 1741 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); |
980 | end = min(end, vma->vm_end); | 1742 | end = min(end, vma->vm_end); |
981 | pages -= change_prot_numa(vma, start, end); | 1743 | nr_pte_updates += change_prot_numa(vma, start, end); |
1744 | |||
1745 | /* | ||
1746 | * Scan sysctl_numa_balancing_scan_size but ensure that | ||
1747 | * at least one PTE is updated so that unused virtual | ||
1748 | * address space is quickly skipped. | ||
1749 | */ | ||
1750 | if (nr_pte_updates) | ||
1751 | pages -= (end - start) >> PAGE_SHIFT; | ||
982 | 1752 | ||
983 | start = end; | 1753 | start = end; |
984 | if (pages <= 0) | 1754 | if (pages <= 0) |
@@ -988,10 +1758,10 @@ void task_numa_work(struct callback_head *work) | |||
988 | 1758 | ||
989 | out: | 1759 | out: |
990 | /* | 1760 | /* |
991 | * It is possible to reach the end of the VMA list but the last few VMAs are | 1761 | * It is possible to reach the end of the VMA list but the last few |
992 | * not guaranteed to the vma_migratable. If they are not, we would find the | 1762 | * VMAs are not guaranteed to the vma_migratable. If they are not, we |
993 | * !migratable VMA on the next scan but not reset the scanner to the start | 1763 | * would find the !migratable VMA on the next scan but not reset the |
994 | * so check it now. | 1764 | * scanner to the start so check it now. |
995 | */ | 1765 | */ |
996 | if (vma) | 1766 | if (vma) |
997 | mm->numa_scan_offset = start; | 1767 | mm->numa_scan_offset = start; |
@@ -1025,8 +1795,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
1025 | 1795 | ||
1026 | if (now - curr->node_stamp > period) { | 1796 | if (now - curr->node_stamp > period) { |
1027 | if (!curr->node_stamp) | 1797 | if (!curr->node_stamp) |
1028 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1798 | curr->numa_scan_period = task_scan_min(curr); |
1029 | curr->node_stamp = now; | 1799 | curr->node_stamp += period; |
1030 | 1800 | ||
1031 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | 1801 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { |
1032 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | 1802 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ |
@@ -1038,6 +1808,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
1038 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | 1808 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) |
1039 | { | 1809 | { |
1040 | } | 1810 | } |
1811 | |||
1812 | static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
1813 | { | ||
1814 | } | ||
1815 | |||
1816 | static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
1817 | { | ||
1818 | } | ||
1041 | #endif /* CONFIG_NUMA_BALANCING */ | 1819 | #endif /* CONFIG_NUMA_BALANCING */ |
1042 | 1820 | ||
1043 | static void | 1821 | static void |
@@ -1047,8 +1825,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1047 | if (!parent_entity(se)) | 1825 | if (!parent_entity(se)) |
1048 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 1826 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
1049 | #ifdef CONFIG_SMP | 1827 | #ifdef CONFIG_SMP |
1050 | if (entity_is_task(se)) | 1828 | if (entity_is_task(se)) { |
1051 | list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); | 1829 | struct rq *rq = rq_of(cfs_rq); |
1830 | |||
1831 | account_numa_enqueue(rq, task_of(se)); | ||
1832 | list_add(&se->group_node, &rq->cfs_tasks); | ||
1833 | } | ||
1052 | #endif | 1834 | #endif |
1053 | cfs_rq->nr_running++; | 1835 | cfs_rq->nr_running++; |
1054 | } | 1836 | } |
@@ -1059,8 +1841,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1059 | update_load_sub(&cfs_rq->load, se->load.weight); | 1841 | update_load_sub(&cfs_rq->load, se->load.weight); |
1060 | if (!parent_entity(se)) | 1842 | if (!parent_entity(se)) |
1061 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | 1843 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
1062 | if (entity_is_task(se)) | 1844 | if (entity_is_task(se)) { |
1845 | account_numa_dequeue(rq_of(cfs_rq), task_of(se)); | ||
1063 | list_del_init(&se->group_node); | 1846 | list_del_init(&se->group_node); |
1847 | } | ||
1064 | cfs_rq->nr_running--; | 1848 | cfs_rq->nr_running--; |
1065 | } | 1849 | } |
1066 | 1850 | ||
@@ -3113,7 +3897,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
3113 | { | 3897 | { |
3114 | struct sched_entity *se = tg->se[cpu]; | 3898 | struct sched_entity *se = tg->se[cpu]; |
3115 | 3899 | ||
3116 | if (!tg->parent) /* the trivial, non-cgroup case */ | 3900 | if (!tg->parent || !wl) /* the trivial, non-cgroup case */ |
3117 | return wl; | 3901 | return wl; |
3118 | 3902 | ||
3119 | for_each_sched_entity(se) { | 3903 | for_each_sched_entity(se) { |
@@ -3166,8 +3950,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
3166 | } | 3950 | } |
3167 | #else | 3951 | #else |
3168 | 3952 | ||
3169 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 3953 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
3170 | unsigned long wl, unsigned long wg) | ||
3171 | { | 3954 | { |
3172 | return wl; | 3955 | return wl; |
3173 | } | 3956 | } |
@@ -3420,11 +4203,10 @@ done: | |||
3420 | * preempt must be disabled. | 4203 | * preempt must be disabled. |
3421 | */ | 4204 | */ |
3422 | static int | 4205 | static int |
3423 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | 4206 | select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) |
3424 | { | 4207 | { |
3425 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 4208 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
3426 | int cpu = smp_processor_id(); | 4209 | int cpu = smp_processor_id(); |
3427 | int prev_cpu = task_cpu(p); | ||
3428 | int new_cpu = cpu; | 4210 | int new_cpu = cpu; |
3429 | int want_affine = 0; | 4211 | int want_affine = 0; |
3430 | int sync = wake_flags & WF_SYNC; | 4212 | int sync = wake_flags & WF_SYNC; |
@@ -3904,9 +4686,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
3904 | 4686 | ||
3905 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 4687 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
3906 | 4688 | ||
4689 | enum fbq_type { regular, remote, all }; | ||
4690 | |||
3907 | #define LBF_ALL_PINNED 0x01 | 4691 | #define LBF_ALL_PINNED 0x01 |
3908 | #define LBF_NEED_BREAK 0x02 | 4692 | #define LBF_NEED_BREAK 0x02 |
3909 | #define LBF_SOME_PINNED 0x04 | 4693 | #define LBF_DST_PINNED 0x04 |
4694 | #define LBF_SOME_PINNED 0x08 | ||
3910 | 4695 | ||
3911 | struct lb_env { | 4696 | struct lb_env { |
3912 | struct sched_domain *sd; | 4697 | struct sched_domain *sd; |
@@ -3929,6 +4714,8 @@ struct lb_env { | |||
3929 | unsigned int loop; | 4714 | unsigned int loop; |
3930 | unsigned int loop_break; | 4715 | unsigned int loop_break; |
3931 | unsigned int loop_max; | 4716 | unsigned int loop_max; |
4717 | |||
4718 | enum fbq_type fbq_type; | ||
3932 | }; | 4719 | }; |
3933 | 4720 | ||
3934 | /* | 4721 | /* |
@@ -3975,6 +4762,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
3975 | return delta < (s64)sysctl_sched_migration_cost; | 4762 | return delta < (s64)sysctl_sched_migration_cost; |
3976 | } | 4763 | } |
3977 | 4764 | ||
4765 | #ifdef CONFIG_NUMA_BALANCING | ||
4766 | /* Returns true if the destination node has incurred more faults */ | ||
4767 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | ||
4768 | { | ||
4769 | int src_nid, dst_nid; | ||
4770 | |||
4771 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || | ||
4772 | !(env->sd->flags & SD_NUMA)) { | ||
4773 | return false; | ||
4774 | } | ||
4775 | |||
4776 | src_nid = cpu_to_node(env->src_cpu); | ||
4777 | dst_nid = cpu_to_node(env->dst_cpu); | ||
4778 | |||
4779 | if (src_nid == dst_nid) | ||
4780 | return false; | ||
4781 | |||
4782 | /* Always encourage migration to the preferred node. */ | ||
4783 | if (dst_nid == p->numa_preferred_nid) | ||
4784 | return true; | ||
4785 | |||
4786 | /* If both task and group weight improve, this move is a winner. */ | ||
4787 | if (task_weight(p, dst_nid) > task_weight(p, src_nid) && | ||
4788 | group_weight(p, dst_nid) > group_weight(p, src_nid)) | ||
4789 | return true; | ||
4790 | |||
4791 | return false; | ||
4792 | } | ||
4793 | |||
4794 | |||
4795 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | ||
4796 | { | ||
4797 | int src_nid, dst_nid; | ||
4798 | |||
4799 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | ||
4800 | return false; | ||
4801 | |||
4802 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | ||
4803 | return false; | ||
4804 | |||
4805 | src_nid = cpu_to_node(env->src_cpu); | ||
4806 | dst_nid = cpu_to_node(env->dst_cpu); | ||
4807 | |||
4808 | if (src_nid == dst_nid) | ||
4809 | return false; | ||
4810 | |||
4811 | /* Migrating away from the preferred node is always bad. */ | ||
4812 | if (src_nid == p->numa_preferred_nid) | ||
4813 | return true; | ||
4814 | |||
4815 | /* If either task or group weight get worse, don't do it. */ | ||
4816 | if (task_weight(p, dst_nid) < task_weight(p, src_nid) || | ||
4817 | group_weight(p, dst_nid) < group_weight(p, src_nid)) | ||
4818 | return true; | ||
4819 | |||
4820 | return false; | ||
4821 | } | ||
4822 | |||
4823 | #else | ||
4824 | static inline bool migrate_improves_locality(struct task_struct *p, | ||
4825 | struct lb_env *env) | ||
4826 | { | ||
4827 | return false; | ||
4828 | } | ||
4829 | |||
4830 | static inline bool migrate_degrades_locality(struct task_struct *p, | ||
4831 | struct lb_env *env) | ||
4832 | { | ||
4833 | return false; | ||
4834 | } | ||
4835 | #endif | ||
4836 | |||
3978 | /* | 4837 | /* |
3979 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 4838 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
3980 | */ | 4839 | */ |
@@ -3997,6 +4856,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3997 | 4856 | ||
3998 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 4857 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
3999 | 4858 | ||
4859 | env->flags |= LBF_SOME_PINNED; | ||
4860 | |||
4000 | /* | 4861 | /* |
4001 | * Remember if this task can be migrated to any other cpu in | 4862 | * Remember if this task can be migrated to any other cpu in |
4002 | * our sched_group. We may want to revisit it if we couldn't | 4863 | * our sched_group. We may want to revisit it if we couldn't |
@@ -4005,13 +4866,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
4005 | * Also avoid computing new_dst_cpu if we have already computed | 4866 | * Also avoid computing new_dst_cpu if we have already computed |
4006 | * one in current iteration. | 4867 | * one in current iteration. |
4007 | */ | 4868 | */ |
4008 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | 4869 | if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) |
4009 | return 0; | 4870 | return 0; |
4010 | 4871 | ||
4011 | /* Prevent to re-select dst_cpu via env's cpus */ | 4872 | /* Prevent to re-select dst_cpu via env's cpus */ |
4012 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 4873 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
4013 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { | 4874 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { |
4014 | env->flags |= LBF_SOME_PINNED; | 4875 | env->flags |= LBF_DST_PINNED; |
4015 | env->new_dst_cpu = cpu; | 4876 | env->new_dst_cpu = cpu; |
4016 | break; | 4877 | break; |
4017 | } | 4878 | } |
@@ -4030,11 +4891,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
4030 | 4891 | ||
4031 | /* | 4892 | /* |
4032 | * Aggressive migration if: | 4893 | * Aggressive migration if: |
4033 | * 1) task is cache cold, or | 4894 | * 1) destination numa is preferred |
4034 | * 2) too many balance attempts have failed. | 4895 | * 2) task is cache cold, or |
4896 | * 3) too many balance attempts have failed. | ||
4035 | */ | 4897 | */ |
4036 | |||
4037 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); | 4898 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); |
4899 | if (!tsk_cache_hot) | ||
4900 | tsk_cache_hot = migrate_degrades_locality(p, env); | ||
4901 | |||
4902 | if (migrate_improves_locality(p, env)) { | ||
4903 | #ifdef CONFIG_SCHEDSTATS | ||
4904 | if (tsk_cache_hot) { | ||
4905 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | ||
4906 | schedstat_inc(p, se.statistics.nr_forced_migrations); | ||
4907 | } | ||
4908 | #endif | ||
4909 | return 1; | ||
4910 | } | ||
4911 | |||
4038 | if (!tsk_cache_hot || | 4912 | if (!tsk_cache_hot || |
4039 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 4913 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
4040 | 4914 | ||
@@ -4077,8 +4951,6 @@ static int move_one_task(struct lb_env *env) | |||
4077 | return 0; | 4951 | return 0; |
4078 | } | 4952 | } |
4079 | 4953 | ||
4080 | static unsigned long task_h_load(struct task_struct *p); | ||
4081 | |||
4082 | static const unsigned int sched_nr_migrate_break = 32; | 4954 | static const unsigned int sched_nr_migrate_break = 32; |
4083 | 4955 | ||
4084 | /* | 4956 | /* |
@@ -4291,6 +5163,10 @@ struct sg_lb_stats { | |||
4291 | unsigned int group_weight; | 5163 | unsigned int group_weight; |
4292 | int group_imb; /* Is there an imbalance in the group ? */ | 5164 | int group_imb; /* Is there an imbalance in the group ? */ |
4293 | int group_has_capacity; /* Is there extra capacity in the group? */ | 5165 | int group_has_capacity; /* Is there extra capacity in the group? */ |
5166 | #ifdef CONFIG_NUMA_BALANCING | ||
5167 | unsigned int nr_numa_running; | ||
5168 | unsigned int nr_preferred_running; | ||
5169 | #endif | ||
4294 | }; | 5170 | }; |
4295 | 5171 | ||
4296 | /* | 5172 | /* |
@@ -4447,7 +5323,7 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
4447 | { | 5323 | { |
4448 | struct sched_domain *child = sd->child; | 5324 | struct sched_domain *child = sd->child; |
4449 | struct sched_group *group, *sdg = sd->groups; | 5325 | struct sched_group *group, *sdg = sd->groups; |
4450 | unsigned long power; | 5326 | unsigned long power, power_orig; |
4451 | unsigned long interval; | 5327 | unsigned long interval; |
4452 | 5328 | ||
4453 | interval = msecs_to_jiffies(sd->balance_interval); | 5329 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -4459,7 +5335,7 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
4459 | return; | 5335 | return; |
4460 | } | 5336 | } |
4461 | 5337 | ||
4462 | power = 0; | 5338 | power_orig = power = 0; |
4463 | 5339 | ||
4464 | if (child->flags & SD_OVERLAP) { | 5340 | if (child->flags & SD_OVERLAP) { |
4465 | /* | 5341 | /* |
@@ -4467,8 +5343,12 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
4467 | * span the current group. | 5343 | * span the current group. |
4468 | */ | 5344 | */ |
4469 | 5345 | ||
4470 | for_each_cpu(cpu, sched_group_cpus(sdg)) | 5346 | for_each_cpu(cpu, sched_group_cpus(sdg)) { |
4471 | power += power_of(cpu); | 5347 | struct sched_group *sg = cpu_rq(cpu)->sd->groups; |
5348 | |||
5349 | power_orig += sg->sgp->power_orig; | ||
5350 | power += sg->sgp->power; | ||
5351 | } | ||
4472 | } else { | 5352 | } else { |
4473 | /* | 5353 | /* |
4474 | * !SD_OVERLAP domains can assume that child groups | 5354 | * !SD_OVERLAP domains can assume that child groups |
@@ -4477,12 +5357,14 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
4477 | 5357 | ||
4478 | group = child->groups; | 5358 | group = child->groups; |
4479 | do { | 5359 | do { |
5360 | power_orig += group->sgp->power_orig; | ||
4480 | power += group->sgp->power; | 5361 | power += group->sgp->power; |
4481 | group = group->next; | 5362 | group = group->next; |
4482 | } while (group != child->groups); | 5363 | } while (group != child->groups); |
4483 | } | 5364 | } |
4484 | 5365 | ||
4485 | sdg->sgp->power_orig = sdg->sgp->power = power; | 5366 | sdg->sgp->power_orig = power_orig; |
5367 | sdg->sgp->power = power; | ||
4486 | } | 5368 | } |
4487 | 5369 | ||
4488 | /* | 5370 | /* |
@@ -4526,13 +5408,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
4526 | * cpu 3 and leave one of the cpus in the second group unused. | 5408 | * cpu 3 and leave one of the cpus in the second group unused. |
4527 | * | 5409 | * |
4528 | * The current solution to this issue is detecting the skew in the first group | 5410 | * The current solution to this issue is detecting the skew in the first group |
4529 | * by noticing it has a cpu that is overloaded while the remaining cpus are | 5411 | * by noticing the lower domain failed to reach balance and had difficulty |
4530 | * idle -- or rather, there's a distinct imbalance in the cpus; see | 5412 | * moving tasks due to affinity constraints. |
4531 | * sg_imbalanced(). | ||
4532 | * | 5413 | * |
4533 | * When this is so detected; this group becomes a candidate for busiest; see | 5414 | * When this is so detected; this group becomes a candidate for busiest; see |
4534 | * update_sd_pick_busiest(). And calculcate_imbalance() and | 5415 | * update_sd_pick_busiest(). And calculcate_imbalance() and |
4535 | * find_busiest_group() avoid some of the usual balance conditional to allow it | 5416 | * find_busiest_group() avoid some of the usual balance conditions to allow it |
4536 | * to create an effective group imbalance. | 5417 | * to create an effective group imbalance. |
4537 | * | 5418 | * |
4538 | * This is a somewhat tricky proposition since the next run might not find the | 5419 | * This is a somewhat tricky proposition since the next run might not find the |
@@ -4540,49 +5421,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
4540 | * subtle and fragile situation. | 5421 | * subtle and fragile situation. |
4541 | */ | 5422 | */ |
4542 | 5423 | ||
4543 | struct sg_imb_stats { | 5424 | static inline int sg_imbalanced(struct sched_group *group) |
4544 | unsigned long max_nr_running, min_nr_running; | ||
4545 | unsigned long max_cpu_load, min_cpu_load; | ||
4546 | }; | ||
4547 | |||
4548 | static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) | ||
4549 | { | 5425 | { |
4550 | sgi->max_cpu_load = sgi->max_nr_running = 0UL; | 5426 | return group->sgp->imbalance; |
4551 | sgi->min_cpu_load = sgi->min_nr_running = ~0UL; | ||
4552 | } | 5427 | } |
4553 | 5428 | ||
4554 | static inline void | 5429 | /* |
4555 | update_sg_imb_stats(struct sg_imb_stats *sgi, | 5430 | * Compute the group capacity. |
4556 | unsigned long load, unsigned long nr_running) | 5431 | * |
5432 | * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by | ||
5433 | * first dividing out the smt factor and computing the actual number of cores | ||
5434 | * and limit power unit capacity with that. | ||
5435 | */ | ||
5436 | static inline int sg_capacity(struct lb_env *env, struct sched_group *group) | ||
4557 | { | 5437 | { |
4558 | if (load > sgi->max_cpu_load) | 5438 | unsigned int capacity, smt, cpus; |
4559 | sgi->max_cpu_load = load; | 5439 | unsigned int power, power_orig; |
4560 | if (sgi->min_cpu_load > load) | ||
4561 | sgi->min_cpu_load = load; | ||
4562 | 5440 | ||
4563 | if (nr_running > sgi->max_nr_running) | 5441 | power = group->sgp->power; |
4564 | sgi->max_nr_running = nr_running; | 5442 | power_orig = group->sgp->power_orig; |
4565 | if (sgi->min_nr_running > nr_running) | 5443 | cpus = group->group_weight; |
4566 | sgi->min_nr_running = nr_running; | ||
4567 | } | ||
4568 | 5444 | ||
4569 | static inline int | 5445 | /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ |
4570 | sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) | 5446 | smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); |
4571 | { | 5447 | capacity = cpus / smt; /* cores */ |
4572 | /* | ||
4573 | * Consider the group unbalanced when the imbalance is larger | ||
4574 | * than the average weight of a task. | ||
4575 | * | ||
4576 | * APZ: with cgroup the avg task weight can vary wildly and | ||
4577 | * might not be a suitable number - should we keep a | ||
4578 | * normalized nr_running number somewhere that negates | ||
4579 | * the hierarchy? | ||
4580 | */ | ||
4581 | if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && | ||
4582 | (sgi->max_nr_running - sgi->min_nr_running) > 1) | ||
4583 | return 1; | ||
4584 | 5448 | ||
4585 | return 0; | 5449 | capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); |
5450 | if (!capacity) | ||
5451 | capacity = fix_small_capacity(env->sd, group); | ||
5452 | |||
5453 | return capacity; | ||
4586 | } | 5454 | } |
4587 | 5455 | ||
4588 | /** | 5456 | /** |
@@ -4597,12 +5465,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4597 | struct sched_group *group, int load_idx, | 5465 | struct sched_group *group, int load_idx, |
4598 | int local_group, struct sg_lb_stats *sgs) | 5466 | int local_group, struct sg_lb_stats *sgs) |
4599 | { | 5467 | { |
4600 | struct sg_imb_stats sgi; | ||
4601 | unsigned long nr_running; | 5468 | unsigned long nr_running; |
4602 | unsigned long load; | 5469 | unsigned long load; |
4603 | int i; | 5470 | int i; |
4604 | 5471 | ||
4605 | init_sg_imb_stats(&sgi); | 5472 | memset(sgs, 0, sizeof(*sgs)); |
4606 | 5473 | ||
4607 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5474 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
4608 | struct rq *rq = cpu_rq(i); | 5475 | struct rq *rq = cpu_rq(i); |
@@ -4610,24 +5477,22 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4610 | nr_running = rq->nr_running; | 5477 | nr_running = rq->nr_running; |
4611 | 5478 | ||
4612 | /* Bias balancing toward cpus of our domain */ | 5479 | /* Bias balancing toward cpus of our domain */ |
4613 | if (local_group) { | 5480 | if (local_group) |
4614 | load = target_load(i, load_idx); | 5481 | load = target_load(i, load_idx); |
4615 | } else { | 5482 | else |
4616 | load = source_load(i, load_idx); | 5483 | load = source_load(i, load_idx); |
4617 | update_sg_imb_stats(&sgi, load, nr_running); | ||
4618 | } | ||
4619 | 5484 | ||
4620 | sgs->group_load += load; | 5485 | sgs->group_load += load; |
4621 | sgs->sum_nr_running += nr_running; | 5486 | sgs->sum_nr_running += nr_running; |
5487 | #ifdef CONFIG_NUMA_BALANCING | ||
5488 | sgs->nr_numa_running += rq->nr_numa_running; | ||
5489 | sgs->nr_preferred_running += rq->nr_preferred_running; | ||
5490 | #endif | ||
4622 | sgs->sum_weighted_load += weighted_cpuload(i); | 5491 | sgs->sum_weighted_load += weighted_cpuload(i); |
4623 | if (idle_cpu(i)) | 5492 | if (idle_cpu(i)) |
4624 | sgs->idle_cpus++; | 5493 | sgs->idle_cpus++; |
4625 | } | 5494 | } |
4626 | 5495 | ||
4627 | if (local_group && (env->idle != CPU_NEWLY_IDLE || | ||
4628 | time_after_eq(jiffies, group->sgp->next_update))) | ||
4629 | update_group_power(env->sd, env->dst_cpu); | ||
4630 | |||
4631 | /* Adjust by relative CPU power of the group */ | 5496 | /* Adjust by relative CPU power of the group */ |
4632 | sgs->group_power = group->sgp->power; | 5497 | sgs->group_power = group->sgp->power; |
4633 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | 5498 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; |
@@ -4635,16 +5500,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4635 | if (sgs->sum_nr_running) | 5500 | if (sgs->sum_nr_running) |
4636 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 5501 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
4637 | 5502 | ||
4638 | sgs->group_imb = sg_imbalanced(sgs, &sgi); | ||
4639 | |||
4640 | sgs->group_capacity = | ||
4641 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); | ||
4642 | |||
4643 | if (!sgs->group_capacity) | ||
4644 | sgs->group_capacity = fix_small_capacity(env->sd, group); | ||
4645 | |||
4646 | sgs->group_weight = group->group_weight; | 5503 | sgs->group_weight = group->group_weight; |
4647 | 5504 | ||
5505 | sgs->group_imb = sg_imbalanced(group); | ||
5506 | sgs->group_capacity = sg_capacity(env, group); | ||
5507 | |||
4648 | if (sgs->group_capacity > sgs->sum_nr_running) | 5508 | if (sgs->group_capacity > sgs->sum_nr_running) |
4649 | sgs->group_has_capacity = 1; | 5509 | sgs->group_has_capacity = 1; |
4650 | } | 5510 | } |
@@ -4693,14 +5553,43 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
4693 | return false; | 5553 | return false; |
4694 | } | 5554 | } |
4695 | 5555 | ||
5556 | #ifdef CONFIG_NUMA_BALANCING | ||
5557 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
5558 | { | ||
5559 | if (sgs->sum_nr_running > sgs->nr_numa_running) | ||
5560 | return regular; | ||
5561 | if (sgs->sum_nr_running > sgs->nr_preferred_running) | ||
5562 | return remote; | ||
5563 | return all; | ||
5564 | } | ||
5565 | |||
5566 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
5567 | { | ||
5568 | if (rq->nr_running > rq->nr_numa_running) | ||
5569 | return regular; | ||
5570 | if (rq->nr_running > rq->nr_preferred_running) | ||
5571 | return remote; | ||
5572 | return all; | ||
5573 | } | ||
5574 | #else | ||
5575 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
5576 | { | ||
5577 | return all; | ||
5578 | } | ||
5579 | |||
5580 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
5581 | { | ||
5582 | return regular; | ||
5583 | } | ||
5584 | #endif /* CONFIG_NUMA_BALANCING */ | ||
5585 | |||
4696 | /** | 5586 | /** |
4697 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 5587 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
4698 | * @env: The load balancing environment. | 5588 | * @env: The load balancing environment. |
4699 | * @balance: Should we balance. | 5589 | * @balance: Should we balance. |
4700 | * @sds: variable to hold the statistics for this sched_domain. | 5590 | * @sds: variable to hold the statistics for this sched_domain. |
4701 | */ | 5591 | */ |
4702 | static inline void update_sd_lb_stats(struct lb_env *env, | 5592 | static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) |
4703 | struct sd_lb_stats *sds) | ||
4704 | { | 5593 | { |
4705 | struct sched_domain *child = env->sd->child; | 5594 | struct sched_domain *child = env->sd->child; |
4706 | struct sched_group *sg = env->sd->groups; | 5595 | struct sched_group *sg = env->sd->groups; |
@@ -4720,11 +5609,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4720 | if (local_group) { | 5609 | if (local_group) { |
4721 | sds->local = sg; | 5610 | sds->local = sg; |
4722 | sgs = &sds->local_stat; | 5611 | sgs = &sds->local_stat; |
5612 | |||
5613 | if (env->idle != CPU_NEWLY_IDLE || | ||
5614 | time_after_eq(jiffies, sg->sgp->next_update)) | ||
5615 | update_group_power(env->sd, env->dst_cpu); | ||
4723 | } | 5616 | } |
4724 | 5617 | ||
4725 | memset(sgs, 0, sizeof(*sgs)); | ||
4726 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); | 5618 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
4727 | 5619 | ||
5620 | if (local_group) | ||
5621 | goto next_group; | ||
5622 | |||
4728 | /* | 5623 | /* |
4729 | * In case the child domain prefers tasks go to siblings | 5624 | * In case the child domain prefers tasks go to siblings |
4730 | * first, lower the sg capacity to one so that we'll try | 5625 | * first, lower the sg capacity to one so that we'll try |
@@ -4735,21 +5630,25 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4735 | * heaviest group when it is already under-utilized (possible | 5630 | * heaviest group when it is already under-utilized (possible |
4736 | * with a large weight task outweighs the tasks on the system). | 5631 | * with a large weight task outweighs the tasks on the system). |
4737 | */ | 5632 | */ |
4738 | if (prefer_sibling && !local_group && | 5633 | if (prefer_sibling && sds->local && |
4739 | sds->local && sds->local_stat.group_has_capacity) | 5634 | sds->local_stat.group_has_capacity) |
4740 | sgs->group_capacity = min(sgs->group_capacity, 1U); | 5635 | sgs->group_capacity = min(sgs->group_capacity, 1U); |
4741 | 5636 | ||
4742 | /* Now, start updating sd_lb_stats */ | 5637 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
4743 | sds->total_load += sgs->group_load; | ||
4744 | sds->total_pwr += sgs->group_power; | ||
4745 | |||
4746 | if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { | ||
4747 | sds->busiest = sg; | 5638 | sds->busiest = sg; |
4748 | sds->busiest_stat = *sgs; | 5639 | sds->busiest_stat = *sgs; |
4749 | } | 5640 | } |
4750 | 5641 | ||
5642 | next_group: | ||
5643 | /* Now, start updating sd_lb_stats */ | ||
5644 | sds->total_load += sgs->group_load; | ||
5645 | sds->total_pwr += sgs->group_power; | ||
5646 | |||
4751 | sg = sg->next; | 5647 | sg = sg->next; |
4752 | } while (sg != env->sd->groups); | 5648 | } while (sg != env->sd->groups); |
5649 | |||
5650 | if (env->sd->flags & SD_NUMA) | ||
5651 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | ||
4753 | } | 5652 | } |
4754 | 5653 | ||
4755 | /** | 5654 | /** |
@@ -5053,15 +5952,39 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
5053 | int i; | 5952 | int i; |
5054 | 5953 | ||
5055 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5954 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
5056 | unsigned long power = power_of(i); | 5955 | unsigned long power, capacity, wl; |
5057 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 5956 | enum fbq_type rt; |
5058 | SCHED_POWER_SCALE); | 5957 | |
5059 | unsigned long wl; | 5958 | rq = cpu_rq(i); |
5959 | rt = fbq_classify_rq(rq); | ||
5060 | 5960 | ||
5961 | /* | ||
5962 | * We classify groups/runqueues into three groups: | ||
5963 | * - regular: there are !numa tasks | ||
5964 | * - remote: there are numa tasks that run on the 'wrong' node | ||
5965 | * - all: there is no distinction | ||
5966 | * | ||
5967 | * In order to avoid migrating ideally placed numa tasks, | ||
5968 | * ignore those when there's better options. | ||
5969 | * | ||
5970 | * If we ignore the actual busiest queue to migrate another | ||
5971 | * task, the next balance pass can still reduce the busiest | ||
5972 | * queue by moving tasks around inside the node. | ||
5973 | * | ||
5974 | * If we cannot move enough load due to this classification | ||
5975 | * the next pass will adjust the group classification and | ||
5976 | * allow migration of more tasks. | ||
5977 | * | ||
5978 | * Both cases only affect the total convergence complexity. | ||
5979 | */ | ||
5980 | if (rt > env->fbq_type) | ||
5981 | continue; | ||
5982 | |||
5983 | power = power_of(i); | ||
5984 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | ||
5061 | if (!capacity) | 5985 | if (!capacity) |
5062 | capacity = fix_small_capacity(env->sd, group); | 5986 | capacity = fix_small_capacity(env->sd, group); |
5063 | 5987 | ||
5064 | rq = cpu_rq(i); | ||
5065 | wl = weighted_cpuload(i); | 5988 | wl = weighted_cpuload(i); |
5066 | 5989 | ||
5067 | /* | 5990 | /* |
@@ -5164,6 +6087,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5164 | int *continue_balancing) | 6087 | int *continue_balancing) |
5165 | { | 6088 | { |
5166 | int ld_moved, cur_ld_moved, active_balance = 0; | 6089 | int ld_moved, cur_ld_moved, active_balance = 0; |
6090 | struct sched_domain *sd_parent = sd->parent; | ||
5167 | struct sched_group *group; | 6091 | struct sched_group *group; |
5168 | struct rq *busiest; | 6092 | struct rq *busiest; |
5169 | unsigned long flags; | 6093 | unsigned long flags; |
@@ -5177,6 +6101,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5177 | .idle = idle, | 6101 | .idle = idle, |
5178 | .loop_break = sched_nr_migrate_break, | 6102 | .loop_break = sched_nr_migrate_break, |
5179 | .cpus = cpus, | 6103 | .cpus = cpus, |
6104 | .fbq_type = all, | ||
5180 | }; | 6105 | }; |
5181 | 6106 | ||
5182 | /* | 6107 | /* |
@@ -5268,17 +6193,17 @@ more_balance: | |||
5268 | * moreover subsequent load balance cycles should correct the | 6193 | * moreover subsequent load balance cycles should correct the |
5269 | * excess load moved. | 6194 | * excess load moved. |
5270 | */ | 6195 | */ |
5271 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6196 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { |
6197 | |||
6198 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
6199 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
5272 | 6200 | ||
5273 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 6201 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
5274 | env.dst_cpu = env.new_dst_cpu; | 6202 | env.dst_cpu = env.new_dst_cpu; |
5275 | env.flags &= ~LBF_SOME_PINNED; | 6203 | env.flags &= ~LBF_DST_PINNED; |
5276 | env.loop = 0; | 6204 | env.loop = 0; |
5277 | env.loop_break = sched_nr_migrate_break; | 6205 | env.loop_break = sched_nr_migrate_break; |
5278 | 6206 | ||
5279 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
5280 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
5281 | |||
5282 | /* | 6207 | /* |
5283 | * Go back to "more_balance" rather than "redo" since we | 6208 | * Go back to "more_balance" rather than "redo" since we |
5284 | * need to continue with same src_cpu. | 6209 | * need to continue with same src_cpu. |
@@ -5286,6 +6211,18 @@ more_balance: | |||
5286 | goto more_balance; | 6211 | goto more_balance; |
5287 | } | 6212 | } |
5288 | 6213 | ||
6214 | /* | ||
6215 | * We failed to reach balance because of affinity. | ||
6216 | */ | ||
6217 | if (sd_parent) { | ||
6218 | int *group_imbalance = &sd_parent->groups->sgp->imbalance; | ||
6219 | |||
6220 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | ||
6221 | *group_imbalance = 1; | ||
6222 | } else if (*group_imbalance) | ||
6223 | *group_imbalance = 0; | ||
6224 | } | ||
6225 | |||
5289 | /* All tasks on this runqueue were pinned by CPU affinity */ | 6226 | /* All tasks on this runqueue were pinned by CPU affinity */ |
5290 | if (unlikely(env.flags & LBF_ALL_PINNED)) { | 6227 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
5291 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 6228 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
@@ -5393,6 +6330,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5393 | struct sched_domain *sd; | 6330 | struct sched_domain *sd; |
5394 | int pulled_task = 0; | 6331 | int pulled_task = 0; |
5395 | unsigned long next_balance = jiffies + HZ; | 6332 | unsigned long next_balance = jiffies + HZ; |
6333 | u64 curr_cost = 0; | ||
5396 | 6334 | ||
5397 | this_rq->idle_stamp = rq_clock(this_rq); | 6335 | this_rq->idle_stamp = rq_clock(this_rq); |
5398 | 6336 | ||
@@ -5409,15 +6347,27 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5409 | for_each_domain(this_cpu, sd) { | 6347 | for_each_domain(this_cpu, sd) { |
5410 | unsigned long interval; | 6348 | unsigned long interval; |
5411 | int continue_balancing = 1; | 6349 | int continue_balancing = 1; |
6350 | u64 t0, domain_cost; | ||
5412 | 6351 | ||
5413 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6352 | if (!(sd->flags & SD_LOAD_BALANCE)) |
5414 | continue; | 6353 | continue; |
5415 | 6354 | ||
6355 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) | ||
6356 | break; | ||
6357 | |||
5416 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 6358 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
6359 | t0 = sched_clock_cpu(this_cpu); | ||
6360 | |||
5417 | /* If we've pulled tasks over stop searching: */ | 6361 | /* If we've pulled tasks over stop searching: */ |
5418 | pulled_task = load_balance(this_cpu, this_rq, | 6362 | pulled_task = load_balance(this_cpu, this_rq, |
5419 | sd, CPU_NEWLY_IDLE, | 6363 | sd, CPU_NEWLY_IDLE, |
5420 | &continue_balancing); | 6364 | &continue_balancing); |
6365 | |||
6366 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
6367 | if (domain_cost > sd->max_newidle_lb_cost) | ||
6368 | sd->max_newidle_lb_cost = domain_cost; | ||
6369 | |||
6370 | curr_cost += domain_cost; | ||
5421 | } | 6371 | } |
5422 | 6372 | ||
5423 | interval = msecs_to_jiffies(sd->balance_interval); | 6373 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -5439,6 +6389,9 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5439 | */ | 6389 | */ |
5440 | this_rq->next_balance = next_balance; | 6390 | this_rq->next_balance = next_balance; |
5441 | } | 6391 | } |
6392 | |||
6393 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
6394 | this_rq->max_idle_balance_cost = curr_cost; | ||
5442 | } | 6395 | } |
5443 | 6396 | ||
5444 | /* | 6397 | /* |
@@ -5662,15 +6615,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5662 | /* Earliest time when we have to do rebalance again */ | 6615 | /* Earliest time when we have to do rebalance again */ |
5663 | unsigned long next_balance = jiffies + 60*HZ; | 6616 | unsigned long next_balance = jiffies + 60*HZ; |
5664 | int update_next_balance = 0; | 6617 | int update_next_balance = 0; |
5665 | int need_serialize; | 6618 | int need_serialize, need_decay = 0; |
6619 | u64 max_cost = 0; | ||
5666 | 6620 | ||
5667 | update_blocked_averages(cpu); | 6621 | update_blocked_averages(cpu); |
5668 | 6622 | ||
5669 | rcu_read_lock(); | 6623 | rcu_read_lock(); |
5670 | for_each_domain(cpu, sd) { | 6624 | for_each_domain(cpu, sd) { |
6625 | /* | ||
6626 | * Decay the newidle max times here because this is a regular | ||
6627 | * visit to all the domains. Decay ~1% per second. | ||
6628 | */ | ||
6629 | if (time_after(jiffies, sd->next_decay_max_lb_cost)) { | ||
6630 | sd->max_newidle_lb_cost = | ||
6631 | (sd->max_newidle_lb_cost * 253) / 256; | ||
6632 | sd->next_decay_max_lb_cost = jiffies + HZ; | ||
6633 | need_decay = 1; | ||
6634 | } | ||
6635 | max_cost += sd->max_newidle_lb_cost; | ||
6636 | |||
5671 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6637 | if (!(sd->flags & SD_LOAD_BALANCE)) |
5672 | continue; | 6638 | continue; |
5673 | 6639 | ||
6640 | /* | ||
6641 | * Stop the load balance at this level. There is another | ||
6642 | * CPU in our sched group which is doing load balancing more | ||
6643 | * actively. | ||
6644 | */ | ||
6645 | if (!continue_balancing) { | ||
6646 | if (need_decay) | ||
6647 | continue; | ||
6648 | break; | ||
6649 | } | ||
6650 | |||
5674 | interval = sd->balance_interval; | 6651 | interval = sd->balance_interval; |
5675 | if (idle != CPU_IDLE) | 6652 | if (idle != CPU_IDLE) |
5676 | interval *= sd->busy_factor; | 6653 | interval *= sd->busy_factor; |
@@ -5689,7 +6666,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5689 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 6666 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
5690 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { | 6667 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { |
5691 | /* | 6668 | /* |
5692 | * The LBF_SOME_PINNED logic could have changed | 6669 | * The LBF_DST_PINNED logic could have changed |
5693 | * env->dst_cpu, so we can't know our idle | 6670 | * env->dst_cpu, so we can't know our idle |
5694 | * state even if we migrated tasks. Update it. | 6671 | * state even if we migrated tasks. Update it. |
5695 | */ | 6672 | */ |
@@ -5704,14 +6681,14 @@ out: | |||
5704 | next_balance = sd->last_balance + interval; | 6681 | next_balance = sd->last_balance + interval; |
5705 | update_next_balance = 1; | 6682 | update_next_balance = 1; |
5706 | } | 6683 | } |
5707 | 6684 | } | |
6685 | if (need_decay) { | ||
5708 | /* | 6686 | /* |
5709 | * Stop the load balance at this level. There is another | 6687 | * Ensure the rq-wide value also decays but keep it at a |
5710 | * CPU in our sched group which is doing load balancing more | 6688 | * reasonable floor to avoid funnies with rq->avg_idle. |
5711 | * actively. | ||
5712 | */ | 6689 | */ |
5713 | if (!continue_balancing) | 6690 | rq->max_idle_balance_cost = |
5714 | break; | 6691 | max((u64)sysctl_sched_migration_cost, max_cost); |
5715 | } | 6692 | } |
5716 | rcu_read_unlock(); | 6693 | rcu_read_unlock(); |
5717 | 6694 | ||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 99399f8e4799..5716929a2e3a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false) | |||
63 | /* | 63 | /* |
64 | * Apply the automatic NUMA scheduling policy. Enabled automatically | 64 | * Apply the automatic NUMA scheduling policy. Enabled automatically |
65 | * at runtime if running on a NUMA machine. Can be controlled via | 65 | * at runtime if running on a NUMA machine. Can be controlled via |
66 | * numa_balancing=. Allow PTE scanning to be forced on UMA machines | 66 | * numa_balancing= |
67 | * for debugging the core machinery. | ||
68 | */ | 67 | */ |
69 | #ifdef CONFIG_NUMA_BALANCING | 68 | #ifdef CONFIG_NUMA_BALANCING |
70 | SCHED_FEAT(NUMA, false) | 69 | SCHED_FEAT(NUMA, false) |
71 | SCHED_FEAT(NUMA_FORCE, false) | 70 | |
71 | /* | ||
72 | * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a | ||
73 | * higher number of hinting faults are recorded during active load | ||
74 | * balancing. | ||
75 | */ | ||
76 | SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) | ||
77 | |||
78 | /* | ||
79 | * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a | ||
80 | * lower number of hinting faults have been recorded. As this has | ||
81 | * the potential to prevent a task ever migrating to a new node | ||
82 | * due to CPU overload it is disabled by default. | ||
83 | */ | ||
84 | SCHED_FEAT(NUMA_RESIST_LOWER, false) | ||
72 | #endif | 85 | #endif |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index d8da01008d39..516c3d9ceea1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -9,7 +9,7 @@ | |||
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
11 | static int | 11 | static int |
12 | select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) | 12 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) |
13 | { | 13 | { |
14 | return task_cpu(p); /* IDLE tasks as never migrated */ | 14 | return task_cpu(p); /* IDLE tasks as never migrated */ |
15 | } | 15 | } |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 01970c8e64df..e9304cdc26fe 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -1169,13 +1169,10 @@ static void yield_task_rt(struct rq *rq) | |||
1169 | static int find_lowest_rq(struct task_struct *task); | 1169 | static int find_lowest_rq(struct task_struct *task); |
1170 | 1170 | ||
1171 | static int | 1171 | static int |
1172 | select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | 1172 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) |
1173 | { | 1173 | { |
1174 | struct task_struct *curr; | 1174 | struct task_struct *curr; |
1175 | struct rq *rq; | 1175 | struct rq *rq; |
1176 | int cpu; | ||
1177 | |||
1178 | cpu = task_cpu(p); | ||
1179 | 1176 | ||
1180 | if (p->nr_cpus_allowed == 1) | 1177 | if (p->nr_cpus_allowed == 1) |
1181 | goto out; | 1178 | goto out; |
@@ -1213,8 +1210,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1213 | */ | 1210 | */ |
1214 | if (curr && unlikely(rt_task(curr)) && | 1211 | if (curr && unlikely(rt_task(curr)) && |
1215 | (curr->nr_cpus_allowed < 2 || | 1212 | (curr->nr_cpus_allowed < 2 || |
1216 | curr->prio <= p->prio) && | 1213 | curr->prio <= p->prio)) { |
1217 | (p->nr_cpus_allowed > 1)) { | ||
1218 | int target = find_lowest_rq(p); | 1214 | int target = find_lowest_rq(p); |
1219 | 1215 | ||
1220 | if (target != -1) | 1216 | if (target != -1) |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3c5653e1dca..d69cb325c27e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/spinlock.h> | 6 | #include <linux/spinlock.h> |
7 | #include <linux/stop_machine.h> | 7 | #include <linux/stop_machine.h> |
8 | #include <linux/tick.h> | 8 | #include <linux/tick.h> |
9 | #include <linux/slab.h> | ||
9 | 10 | ||
10 | #include "cpupri.h" | 11 | #include "cpupri.h" |
11 | #include "cpuacct.h" | 12 | #include "cpuacct.h" |
@@ -408,6 +409,10 @@ struct rq { | |||
408 | * remote CPUs use both these fields when doing load calculation. | 409 | * remote CPUs use both these fields when doing load calculation. |
409 | */ | 410 | */ |
410 | unsigned int nr_running; | 411 | unsigned int nr_running; |
412 | #ifdef CONFIG_NUMA_BALANCING | ||
413 | unsigned int nr_numa_running; | ||
414 | unsigned int nr_preferred_running; | ||
415 | #endif | ||
411 | #define CPU_LOAD_IDX_MAX 5 | 416 | #define CPU_LOAD_IDX_MAX 5 |
412 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 417 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
413 | unsigned long last_load_update_tick; | 418 | unsigned long last_load_update_tick; |
@@ -476,6 +481,9 @@ struct rq { | |||
476 | u64 age_stamp; | 481 | u64 age_stamp; |
477 | u64 idle_stamp; | 482 | u64 idle_stamp; |
478 | u64 avg_idle; | 483 | u64 avg_idle; |
484 | |||
485 | /* This is used to determine avg_idle's max value */ | ||
486 | u64 max_idle_balance_cost; | ||
479 | #endif | 487 | #endif |
480 | 488 | ||
481 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 489 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq) | |||
552 | return rq->clock_task; | 560 | return rq->clock_task; |
553 | } | 561 | } |
554 | 562 | ||
563 | #ifdef CONFIG_NUMA_BALANCING | ||
564 | extern void sched_setnuma(struct task_struct *p, int node); | ||
565 | extern int migrate_task_to(struct task_struct *p, int cpu); | ||
566 | extern int migrate_swap(struct task_struct *, struct task_struct *); | ||
567 | #endif /* CONFIG_NUMA_BALANCING */ | ||
568 | |||
555 | #ifdef CONFIG_SMP | 569 | #ifdef CONFIG_SMP |
556 | 570 | ||
557 | #define rcu_dereference_check_sched_domain(p) \ | 571 | #define rcu_dereference_check_sched_domain(p) \ |
@@ -593,9 +607,22 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
593 | return hsd; | 607 | return hsd; |
594 | } | 608 | } |
595 | 609 | ||
610 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
611 | { | ||
612 | struct sched_domain *sd; | ||
613 | |||
614 | for_each_domain(cpu, sd) { | ||
615 | if (sd->flags & flag) | ||
616 | break; | ||
617 | } | ||
618 | |||
619 | return sd; | ||
620 | } | ||
621 | |||
596 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 622 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
597 | DECLARE_PER_CPU(int, sd_llc_size); | 623 | DECLARE_PER_CPU(int, sd_llc_size); |
598 | DECLARE_PER_CPU(int, sd_llc_id); | 624 | DECLARE_PER_CPU(int, sd_llc_id); |
625 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); | ||
599 | 626 | ||
600 | struct sched_group_power { | 627 | struct sched_group_power { |
601 | atomic_t ref; | 628 | atomic_t ref; |
@@ -605,6 +632,7 @@ struct sched_group_power { | |||
605 | */ | 632 | */ |
606 | unsigned int power, power_orig; | 633 | unsigned int power, power_orig; |
607 | unsigned long next_update; | 634 | unsigned long next_update; |
635 | int imbalance; /* XXX unrelated to power but shared group state */ | ||
608 | /* | 636 | /* |
609 | * Number of busy cpus in this group. | 637 | * Number of busy cpus in this group. |
610 | */ | 638 | */ |
@@ -719,6 +747,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
719 | */ | 747 | */ |
720 | smp_wmb(); | 748 | smp_wmb(); |
721 | task_thread_info(p)->cpu = cpu; | 749 | task_thread_info(p)->cpu = cpu; |
750 | p->wake_cpu = cpu; | ||
722 | #endif | 751 | #endif |
723 | } | 752 | } |
724 | 753 | ||
@@ -974,7 +1003,7 @@ struct sched_class { | |||
974 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1003 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
975 | 1004 | ||
976 | #ifdef CONFIG_SMP | 1005 | #ifdef CONFIG_SMP |
977 | int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); | 1006 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
978 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | 1007 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); |
979 | 1008 | ||
980 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | 1009 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c7edee71bce8..4ab704339656 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
59 | * from dequeue_task() to account for possible rq->clock skew across cpus. The | 59 | * from dequeue_task() to account for possible rq->clock skew across cpus. The |
60 | * delta taken on each cpu would annul the skew. | 60 | * delta taken on each cpu would annul the skew. |
61 | */ | 61 | */ |
62 | static inline void sched_info_dequeued(struct task_struct *t) | 62 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) |
63 | { | 63 | { |
64 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; | 64 | unsigned long long now = rq_clock(rq), delta = 0; |
65 | 65 | ||
66 | if (unlikely(sched_info_on())) | 66 | if (unlikely(sched_info_on())) |
67 | if (t->sched_info.last_queued) | 67 | if (t->sched_info.last_queued) |
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
69 | sched_info_reset_dequeued(t); | 69 | sched_info_reset_dequeued(t); |
70 | t->sched_info.run_delay += delta; | 70 | t->sched_info.run_delay += delta; |
71 | 71 | ||
72 | rq_sched_info_dequeued(task_rq(t), delta); | 72 | rq_sched_info_dequeued(rq, delta); |
73 | } | 73 | } |
74 | 74 | ||
75 | /* | 75 | /* |
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
77 | * long it was waiting to run. We also note when it began so that we | 77 | * long it was waiting to run. We also note when it began so that we |
78 | * can keep stats on how long its timeslice is. | 78 | * can keep stats on how long its timeslice is. |
79 | */ | 79 | */ |
80 | static void sched_info_arrive(struct task_struct *t) | 80 | static void sched_info_arrive(struct rq *rq, struct task_struct *t) |
81 | { | 81 | { |
82 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; | 82 | unsigned long long now = rq_clock(rq), delta = 0; |
83 | 83 | ||
84 | if (t->sched_info.last_queued) | 84 | if (t->sched_info.last_queued) |
85 | delta = now - t->sched_info.last_queued; | 85 | delta = now - t->sched_info.last_queued; |
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t) | |||
88 | t->sched_info.last_arrival = now; | 88 | t->sched_info.last_arrival = now; |
89 | t->sched_info.pcount++; | 89 | t->sched_info.pcount++; |
90 | 90 | ||
91 | rq_sched_info_arrive(task_rq(t), delta); | 91 | rq_sched_info_arrive(rq, delta); |
92 | } | 92 | } |
93 | 93 | ||
94 | /* | 94 | /* |
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t) | |||
96 | * the timestamp if it is already not set. It's assumed that | 96 | * the timestamp if it is already not set. It's assumed that |
97 | * sched_info_dequeued() will clear that stamp when appropriate. | 97 | * sched_info_dequeued() will clear that stamp when appropriate. |
98 | */ | 98 | */ |
99 | static inline void sched_info_queued(struct task_struct *t) | 99 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) |
100 | { | 100 | { |
101 | if (unlikely(sched_info_on())) | 101 | if (unlikely(sched_info_on())) |
102 | if (!t->sched_info.last_queued) | 102 | if (!t->sched_info.last_queued) |
103 | t->sched_info.last_queued = rq_clock(task_rq(t)); | 103 | t->sched_info.last_queued = rq_clock(rq); |
104 | } | 104 | } |
105 | 105 | ||
106 | /* | 106 | /* |
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t) | |||
111 | * sched_info_queued() to mark that it has now again started waiting on | 111 | * sched_info_queued() to mark that it has now again started waiting on |
112 | * the runqueue. | 112 | * the runqueue. |
113 | */ | 113 | */ |
114 | static inline void sched_info_depart(struct task_struct *t) | 114 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) |
115 | { | 115 | { |
116 | unsigned long long delta = rq_clock(task_rq(t)) - | 116 | unsigned long long delta = rq_clock(rq) - |
117 | t->sched_info.last_arrival; | 117 | t->sched_info.last_arrival; |
118 | 118 | ||
119 | rq_sched_info_depart(task_rq(t), delta); | 119 | rq_sched_info_depart(rq, delta); |
120 | 120 | ||
121 | if (t->state == TASK_RUNNING) | 121 | if (t->state == TASK_RUNNING) |
122 | sched_info_queued(t); | 122 | sched_info_queued(rq, t); |
123 | } | 123 | } |
124 | 124 | ||
125 | /* | 125 | /* |
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t) | |||
128 | * the idle task.) We are only called when prev != next. | 128 | * the idle task.) We are only called when prev != next. |
129 | */ | 129 | */ |
130 | static inline void | 130 | static inline void |
131 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | 131 | __sched_info_switch(struct rq *rq, |
132 | struct task_struct *prev, struct task_struct *next) | ||
132 | { | 133 | { |
133 | struct rq *rq = task_rq(prev); | ||
134 | |||
135 | /* | 134 | /* |
136 | * prev now departs the cpu. It's not interesting to record | 135 | * prev now departs the cpu. It's not interesting to record |
137 | * stats about how efficient we were at scheduling the idle | 136 | * stats about how efficient we were at scheduling the idle |
138 | * process, however. | 137 | * process, however. |
139 | */ | 138 | */ |
140 | if (prev != rq->idle) | 139 | if (prev != rq->idle) |
141 | sched_info_depart(prev); | 140 | sched_info_depart(rq, prev); |
142 | 141 | ||
143 | if (next != rq->idle) | 142 | if (next != rq->idle) |
144 | sched_info_arrive(next); | 143 | sched_info_arrive(rq, next); |
145 | } | 144 | } |
146 | static inline void | 145 | static inline void |
147 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | 146 | sched_info_switch(struct rq *rq, |
147 | struct task_struct *prev, struct task_struct *next) | ||
148 | { | 148 | { |
149 | if (unlikely(sched_info_on())) | 149 | if (unlikely(sched_info_on())) |
150 | __sched_info_switch(prev, next); | 150 | __sched_info_switch(rq, prev, next); |
151 | } | 151 | } |
152 | #else | 152 | #else |
153 | #define sched_info_queued(t) do { } while (0) | 153 | #define sched_info_queued(rq, t) do { } while (0) |
154 | #define sched_info_reset_dequeued(t) do { } while (0) | 154 | #define sched_info_reset_dequeued(t) do { } while (0) |
155 | #define sched_info_dequeued(t) do { } while (0) | 155 | #define sched_info_dequeued(rq, t) do { } while (0) |
156 | #define sched_info_switch(t, next) do { } while (0) | 156 | #define sched_info_depart(rq, t) do { } while (0) |
157 | #define sched_info_arrive(rq, next) do { } while (0) | ||
158 | #define sched_info_switch(rq, t, next) do { } while (0) | ||
157 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | 159 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
158 | 160 | ||
159 | /* | 161 | /* |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index e08fbeeb54b9..47197de8abd9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -11,7 +11,7 @@ | |||
11 | 11 | ||
12 | #ifdef CONFIG_SMP | 12 | #ifdef CONFIG_SMP |
13 | static int | 13 | static int |
14 | select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) | 14 | select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) |
15 | { | 15 | { |
16 | return task_cpu(p); /* stop tasks as never migrate */ | 16 | return task_cpu(p); /* stop tasks as never migrate */ |
17 | } | 17 | } |
diff --git a/kernel/softirq.c b/kernel/softirq.c index d7d498d8cc4f..dcab1d3fb53d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -100,13 +100,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
100 | 100 | ||
101 | raw_local_irq_save(flags); | 101 | raw_local_irq_save(flags); |
102 | /* | 102 | /* |
103 | * The preempt tracer hooks into add_preempt_count and will break | 103 | * The preempt tracer hooks into preempt_count_add and will break |
104 | * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET | 104 | * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET |
105 | * is set and before current->softirq_enabled is cleared. | 105 | * is set and before current->softirq_enabled is cleared. |
106 | * We must manually increment preempt_count here and manually | 106 | * We must manually increment preempt_count here and manually |
107 | * call the trace_preempt_off later. | 107 | * call the trace_preempt_off later. |
108 | */ | 108 | */ |
109 | preempt_count() += cnt; | 109 | __preempt_count_add(cnt); |
110 | /* | 110 | /* |
111 | * Were softirqs turned off above: | 111 | * Were softirqs turned off above: |
112 | */ | 112 | */ |
@@ -120,7 +120,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
120 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 120 | #else /* !CONFIG_TRACE_IRQFLAGS */ |
121 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) | 121 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) |
122 | { | 122 | { |
123 | add_preempt_count(cnt); | 123 | preempt_count_add(cnt); |
124 | barrier(); | 124 | barrier(); |
125 | } | 125 | } |
126 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 126 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
@@ -139,7 +139,7 @@ static void __local_bh_enable(unsigned int cnt) | |||
139 | 139 | ||
140 | if (softirq_count() == cnt) | 140 | if (softirq_count() == cnt) |
141 | trace_softirqs_on(_RET_IP_); | 141 | trace_softirqs_on(_RET_IP_); |
142 | sub_preempt_count(cnt); | 142 | preempt_count_sub(cnt); |
143 | } | 143 | } |
144 | 144 | ||
145 | /* | 145 | /* |
@@ -169,12 +169,12 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
169 | * Keep preemption disabled until we are done with | 169 | * Keep preemption disabled until we are done with |
170 | * softirq processing: | 170 | * softirq processing: |
171 | */ | 171 | */ |
172 | sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); | 172 | preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); |
173 | 173 | ||
174 | if (unlikely(!in_interrupt() && local_softirq_pending())) | 174 | if (unlikely(!in_interrupt() && local_softirq_pending())) |
175 | do_softirq(); | 175 | do_softirq(); |
176 | 176 | ||
177 | dec_preempt_count(); | 177 | preempt_count_dec(); |
178 | #ifdef CONFIG_TRACE_IRQFLAGS | 178 | #ifdef CONFIG_TRACE_IRQFLAGS |
179 | local_irq_enable(); | 179 | local_irq_enable(); |
180 | #endif | 180 | #endif |
@@ -256,7 +256,7 @@ restart: | |||
256 | " exited with %08x?\n", vec_nr, | 256 | " exited with %08x?\n", vec_nr, |
257 | softirq_to_name[vec_nr], h->action, | 257 | softirq_to_name[vec_nr], h->action, |
258 | prev_count, preempt_count()); | 258 | prev_count, preempt_count()); |
259 | preempt_count() = prev_count; | 259 | preempt_count_set(prev_count); |
260 | } | 260 | } |
261 | 261 | ||
262 | rcu_bh_qs(cpu); | 262 | rcu_bh_qs(cpu); |
@@ -369,7 +369,7 @@ void irq_exit(void) | |||
369 | 369 | ||
370 | account_irq_exit_time(current); | 370 | account_irq_exit_time(current); |
371 | trace_hardirq_exit(); | 371 | trace_hardirq_exit(); |
372 | sub_preempt_count(HARDIRQ_OFFSET); | 372 | preempt_count_sub(HARDIRQ_OFFSET); |
373 | if (!in_interrupt() && local_softirq_pending()) | 373 | if (!in_interrupt() && local_softirq_pending()) |
374 | invoke_softirq(); | 374 | invoke_softirq(); |
375 | 375 | ||
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c09f2955ae30..32a6c44d8f78 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -115,6 +115,166 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | |||
115 | return done.executed ? done.ret : -ENOENT; | 115 | return done.executed ? done.ret : -ENOENT; |
116 | } | 116 | } |
117 | 117 | ||
118 | /* This controls the threads on each CPU. */ | ||
119 | enum multi_stop_state { | ||
120 | /* Dummy starting state for thread. */ | ||
121 | MULTI_STOP_NONE, | ||
122 | /* Awaiting everyone to be scheduled. */ | ||
123 | MULTI_STOP_PREPARE, | ||
124 | /* Disable interrupts. */ | ||
125 | MULTI_STOP_DISABLE_IRQ, | ||
126 | /* Run the function */ | ||
127 | MULTI_STOP_RUN, | ||
128 | /* Exit */ | ||
129 | MULTI_STOP_EXIT, | ||
130 | }; | ||
131 | |||
132 | struct multi_stop_data { | ||
133 | int (*fn)(void *); | ||
134 | void *data; | ||
135 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ | ||
136 | unsigned int num_threads; | ||
137 | const struct cpumask *active_cpus; | ||
138 | |||
139 | enum multi_stop_state state; | ||
140 | atomic_t thread_ack; | ||
141 | }; | ||
142 | |||
143 | static void set_state(struct multi_stop_data *msdata, | ||
144 | enum multi_stop_state newstate) | ||
145 | { | ||
146 | /* Reset ack counter. */ | ||
147 | atomic_set(&msdata->thread_ack, msdata->num_threads); | ||
148 | smp_wmb(); | ||
149 | msdata->state = newstate; | ||
150 | } | ||
151 | |||
152 | /* Last one to ack a state moves to the next state. */ | ||
153 | static void ack_state(struct multi_stop_data *msdata) | ||
154 | { | ||
155 | if (atomic_dec_and_test(&msdata->thread_ack)) | ||
156 | set_state(msdata, msdata->state + 1); | ||
157 | } | ||
158 | |||
159 | /* This is the cpu_stop function which stops the CPU. */ | ||
160 | static int multi_cpu_stop(void *data) | ||
161 | { | ||
162 | struct multi_stop_data *msdata = data; | ||
163 | enum multi_stop_state curstate = MULTI_STOP_NONE; | ||
164 | int cpu = smp_processor_id(), err = 0; | ||
165 | unsigned long flags; | ||
166 | bool is_active; | ||
167 | |||
168 | /* | ||
169 | * When called from stop_machine_from_inactive_cpu(), irq might | ||
170 | * already be disabled. Save the state and restore it on exit. | ||
171 | */ | ||
172 | local_save_flags(flags); | ||
173 | |||
174 | if (!msdata->active_cpus) | ||
175 | is_active = cpu == cpumask_first(cpu_online_mask); | ||
176 | else | ||
177 | is_active = cpumask_test_cpu(cpu, msdata->active_cpus); | ||
178 | |||
179 | /* Simple state machine */ | ||
180 | do { | ||
181 | /* Chill out and ensure we re-read multi_stop_state. */ | ||
182 | cpu_relax(); | ||
183 | if (msdata->state != curstate) { | ||
184 | curstate = msdata->state; | ||
185 | switch (curstate) { | ||
186 | case MULTI_STOP_DISABLE_IRQ: | ||
187 | local_irq_disable(); | ||
188 | hard_irq_disable(); | ||
189 | break; | ||
190 | case MULTI_STOP_RUN: | ||
191 | if (is_active) | ||
192 | err = msdata->fn(msdata->data); | ||
193 | break; | ||
194 | default: | ||
195 | break; | ||
196 | } | ||
197 | ack_state(msdata); | ||
198 | } | ||
199 | } while (curstate != MULTI_STOP_EXIT); | ||
200 | |||
201 | local_irq_restore(flags); | ||
202 | return err; | ||
203 | } | ||
204 | |||
205 | struct irq_cpu_stop_queue_work_info { | ||
206 | int cpu1; | ||
207 | int cpu2; | ||
208 | struct cpu_stop_work *work1; | ||
209 | struct cpu_stop_work *work2; | ||
210 | }; | ||
211 | |||
212 | /* | ||
213 | * This function is always run with irqs and preemption disabled. | ||
214 | * This guarantees that both work1 and work2 get queued, before | ||
215 | * our local migrate thread gets the chance to preempt us. | ||
216 | */ | ||
217 | static void irq_cpu_stop_queue_work(void *arg) | ||
218 | { | ||
219 | struct irq_cpu_stop_queue_work_info *info = arg; | ||
220 | cpu_stop_queue_work(info->cpu1, info->work1); | ||
221 | cpu_stop_queue_work(info->cpu2, info->work2); | ||
222 | } | ||
223 | |||
224 | /** | ||
225 | * stop_two_cpus - stops two cpus | ||
226 | * @cpu1: the cpu to stop | ||
227 | * @cpu2: the other cpu to stop | ||
228 | * @fn: function to execute | ||
229 | * @arg: argument to @fn | ||
230 | * | ||
231 | * Stops both the current and specified CPU and runs @fn on one of them. | ||
232 | * | ||
233 | * returns when both are completed. | ||
234 | */ | ||
235 | int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) | ||
236 | { | ||
237 | int call_cpu; | ||
238 | struct cpu_stop_done done; | ||
239 | struct cpu_stop_work work1, work2; | ||
240 | struct irq_cpu_stop_queue_work_info call_args; | ||
241 | struct multi_stop_data msdata = { | ||
242 | .fn = fn, | ||
243 | .data = arg, | ||
244 | .num_threads = 2, | ||
245 | .active_cpus = cpumask_of(cpu1), | ||
246 | }; | ||
247 | |||
248 | work1 = work2 = (struct cpu_stop_work){ | ||
249 | .fn = multi_cpu_stop, | ||
250 | .arg = &msdata, | ||
251 | .done = &done | ||
252 | }; | ||
253 | |||
254 | call_args = (struct irq_cpu_stop_queue_work_info){ | ||
255 | .cpu1 = cpu1, | ||
256 | .cpu2 = cpu2, | ||
257 | .work1 = &work1, | ||
258 | .work2 = &work2, | ||
259 | }; | ||
260 | |||
261 | cpu_stop_init_done(&done, 2); | ||
262 | set_state(&msdata, MULTI_STOP_PREPARE); | ||
263 | |||
264 | /* | ||
265 | * Queuing needs to be done by the lowest numbered CPU, to ensure | ||
266 | * that works are always queued in the same order on every CPU. | ||
267 | * This prevents deadlocks. | ||
268 | */ | ||
269 | call_cpu = min(cpu1, cpu2); | ||
270 | |||
271 | smp_call_function_single(call_cpu, &irq_cpu_stop_queue_work, | ||
272 | &call_args, 0); | ||
273 | |||
274 | wait_for_completion(&done.completion); | ||
275 | return done.executed ? done.ret : -ENOENT; | ||
276 | } | ||
277 | |||
118 | /** | 278 | /** |
119 | * stop_one_cpu_nowait - stop a cpu but don't wait for completion | 279 | * stop_one_cpu_nowait - stop a cpu but don't wait for completion |
120 | * @cpu: cpu to stop | 280 | * @cpu: cpu to stop |
@@ -359,98 +519,14 @@ early_initcall(cpu_stop_init); | |||
359 | 519 | ||
360 | #ifdef CONFIG_STOP_MACHINE | 520 | #ifdef CONFIG_STOP_MACHINE |
361 | 521 | ||
362 | /* This controls the threads on each CPU. */ | ||
363 | enum stopmachine_state { | ||
364 | /* Dummy starting state for thread. */ | ||
365 | STOPMACHINE_NONE, | ||
366 | /* Awaiting everyone to be scheduled. */ | ||
367 | STOPMACHINE_PREPARE, | ||
368 | /* Disable interrupts. */ | ||
369 | STOPMACHINE_DISABLE_IRQ, | ||
370 | /* Run the function */ | ||
371 | STOPMACHINE_RUN, | ||
372 | /* Exit */ | ||
373 | STOPMACHINE_EXIT, | ||
374 | }; | ||
375 | |||
376 | struct stop_machine_data { | ||
377 | int (*fn)(void *); | ||
378 | void *data; | ||
379 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ | ||
380 | unsigned int num_threads; | ||
381 | const struct cpumask *active_cpus; | ||
382 | |||
383 | enum stopmachine_state state; | ||
384 | atomic_t thread_ack; | ||
385 | }; | ||
386 | |||
387 | static void set_state(struct stop_machine_data *smdata, | ||
388 | enum stopmachine_state newstate) | ||
389 | { | ||
390 | /* Reset ack counter. */ | ||
391 | atomic_set(&smdata->thread_ack, smdata->num_threads); | ||
392 | smp_wmb(); | ||
393 | smdata->state = newstate; | ||
394 | } | ||
395 | |||
396 | /* Last one to ack a state moves to the next state. */ | ||
397 | static void ack_state(struct stop_machine_data *smdata) | ||
398 | { | ||
399 | if (atomic_dec_and_test(&smdata->thread_ack)) | ||
400 | set_state(smdata, smdata->state + 1); | ||
401 | } | ||
402 | |||
403 | /* This is the cpu_stop function which stops the CPU. */ | ||
404 | static int stop_machine_cpu_stop(void *data) | ||
405 | { | ||
406 | struct stop_machine_data *smdata = data; | ||
407 | enum stopmachine_state curstate = STOPMACHINE_NONE; | ||
408 | int cpu = smp_processor_id(), err = 0; | ||
409 | unsigned long flags; | ||
410 | bool is_active; | ||
411 | |||
412 | /* | ||
413 | * When called from stop_machine_from_inactive_cpu(), irq might | ||
414 | * already be disabled. Save the state and restore it on exit. | ||
415 | */ | ||
416 | local_save_flags(flags); | ||
417 | |||
418 | if (!smdata->active_cpus) | ||
419 | is_active = cpu == cpumask_first(cpu_online_mask); | ||
420 | else | ||
421 | is_active = cpumask_test_cpu(cpu, smdata->active_cpus); | ||
422 | |||
423 | /* Simple state machine */ | ||
424 | do { | ||
425 | /* Chill out and ensure we re-read stopmachine_state. */ | ||
426 | cpu_relax(); | ||
427 | if (smdata->state != curstate) { | ||
428 | curstate = smdata->state; | ||
429 | switch (curstate) { | ||
430 | case STOPMACHINE_DISABLE_IRQ: | ||
431 | local_irq_disable(); | ||
432 | hard_irq_disable(); | ||
433 | break; | ||
434 | case STOPMACHINE_RUN: | ||
435 | if (is_active) | ||
436 | err = smdata->fn(smdata->data); | ||
437 | break; | ||
438 | default: | ||
439 | break; | ||
440 | } | ||
441 | ack_state(smdata); | ||
442 | } | ||
443 | } while (curstate != STOPMACHINE_EXIT); | ||
444 | |||
445 | local_irq_restore(flags); | ||
446 | return err; | ||
447 | } | ||
448 | |||
449 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 522 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) |
450 | { | 523 | { |
451 | struct stop_machine_data smdata = { .fn = fn, .data = data, | 524 | struct multi_stop_data msdata = { |
452 | .num_threads = num_online_cpus(), | 525 | .fn = fn, |
453 | .active_cpus = cpus }; | 526 | .data = data, |
527 | .num_threads = num_online_cpus(), | ||
528 | .active_cpus = cpus, | ||
529 | }; | ||
454 | 530 | ||
455 | if (!stop_machine_initialized) { | 531 | if (!stop_machine_initialized) { |
456 | /* | 532 | /* |
@@ -461,7 +537,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
461 | unsigned long flags; | 537 | unsigned long flags; |
462 | int ret; | 538 | int ret; |
463 | 539 | ||
464 | WARN_ON_ONCE(smdata.num_threads != 1); | 540 | WARN_ON_ONCE(msdata.num_threads != 1); |
465 | 541 | ||
466 | local_irq_save(flags); | 542 | local_irq_save(flags); |
467 | hard_irq_disable(); | 543 | hard_irq_disable(); |
@@ -472,8 +548,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
472 | } | 548 | } |
473 | 549 | ||
474 | /* Set the initial state and stop all online cpus. */ | 550 | /* Set the initial state and stop all online cpus. */ |
475 | set_state(&smdata, STOPMACHINE_PREPARE); | 551 | set_state(&msdata, MULTI_STOP_PREPARE); |
476 | return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); | 552 | return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); |
477 | } | 553 | } |
478 | 554 | ||
479 | int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 555 | int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) |
@@ -513,25 +589,25 @@ EXPORT_SYMBOL_GPL(stop_machine); | |||
513 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, | 589 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, |
514 | const struct cpumask *cpus) | 590 | const struct cpumask *cpus) |
515 | { | 591 | { |
516 | struct stop_machine_data smdata = { .fn = fn, .data = data, | 592 | struct multi_stop_data msdata = { .fn = fn, .data = data, |
517 | .active_cpus = cpus }; | 593 | .active_cpus = cpus }; |
518 | struct cpu_stop_done done; | 594 | struct cpu_stop_done done; |
519 | int ret; | 595 | int ret; |
520 | 596 | ||
521 | /* Local CPU must be inactive and CPU hotplug in progress. */ | 597 | /* Local CPU must be inactive and CPU hotplug in progress. */ |
522 | BUG_ON(cpu_active(raw_smp_processor_id())); | 598 | BUG_ON(cpu_active(raw_smp_processor_id())); |
523 | smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ | 599 | msdata.num_threads = num_active_cpus() + 1; /* +1 for local */ |
524 | 600 | ||
525 | /* No proper task established and can't sleep - busy wait for lock. */ | 601 | /* No proper task established and can't sleep - busy wait for lock. */ |
526 | while (!mutex_trylock(&stop_cpus_mutex)) | 602 | while (!mutex_trylock(&stop_cpus_mutex)) |
527 | cpu_relax(); | 603 | cpu_relax(); |
528 | 604 | ||
529 | /* Schedule work on other CPUs and execute directly for local CPU */ | 605 | /* Schedule work on other CPUs and execute directly for local CPU */ |
530 | set_state(&smdata, STOPMACHINE_PREPARE); | 606 | set_state(&msdata, MULTI_STOP_PREPARE); |
531 | cpu_stop_init_done(&done, num_active_cpus()); | 607 | cpu_stop_init_done(&done, num_active_cpus()); |
532 | queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, | 608 | queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata, |
533 | &done); | 609 | &done); |
534 | ret = stop_machine_cpu_stop(&smdata); | 610 | ret = multi_cpu_stop(&msdata); |
535 | 611 | ||
536 | /* Busy wait for completion. */ | 612 | /* Busy wait for completion. */ |
537 | while (!completion_done(&done.completion)) | 613 | while (!completion_done(&done.completion)) |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2f06f3c6a3f..a159e1fd2013 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = { | |||
371 | .proc_handler = proc_dointvec, | 371 | .proc_handler = proc_dointvec, |
372 | }, | 372 | }, |
373 | { | 373 | { |
374 | .procname = "numa_balancing_scan_period_reset", | ||
375 | .data = &sysctl_numa_balancing_scan_period_reset, | ||
376 | .maxlen = sizeof(unsigned int), | ||
377 | .mode = 0644, | ||
378 | .proc_handler = proc_dointvec, | ||
379 | }, | ||
380 | { | ||
381 | .procname = "numa_balancing_scan_period_max_ms", | 374 | .procname = "numa_balancing_scan_period_max_ms", |
382 | .data = &sysctl_numa_balancing_scan_period_max, | 375 | .data = &sysctl_numa_balancing_scan_period_max, |
383 | .maxlen = sizeof(unsigned int), | 376 | .maxlen = sizeof(unsigned int), |
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = { | |||
391 | .mode = 0644, | 384 | .mode = 0644, |
392 | .proc_handler = proc_dointvec, | 385 | .proc_handler = proc_dointvec, |
393 | }, | 386 | }, |
387 | { | ||
388 | .procname = "numa_balancing_settle_count", | ||
389 | .data = &sysctl_numa_balancing_settle_count, | ||
390 | .maxlen = sizeof(unsigned int), | ||
391 | .mode = 0644, | ||
392 | .proc_handler = proc_dointvec, | ||
393 | }, | ||
394 | { | ||
395 | .procname = "numa_balancing_migrate_deferred", | ||
396 | .data = &sysctl_numa_balancing_migrate_deferred, | ||
397 | .maxlen = sizeof(unsigned int), | ||
398 | .mode = 0644, | ||
399 | .proc_handler = proc_dointvec, | ||
400 | }, | ||
394 | #endif /* CONFIG_NUMA_BALANCING */ | 401 | #endif /* CONFIG_NUMA_BALANCING */ |
395 | #endif /* CONFIG_SCHED_DEBUG */ | 402 | #endif /* CONFIG_SCHED_DEBUG */ |
396 | { | 403 | { |
diff --git a/kernel/timer.c b/kernel/timer.c index 4296d13db3d1..6582b82fa966 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) | |||
1092 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | 1092 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), |
1093 | unsigned long data) | 1093 | unsigned long data) |
1094 | { | 1094 | { |
1095 | int preempt_count = preempt_count(); | 1095 | int count = preempt_count(); |
1096 | 1096 | ||
1097 | #ifdef CONFIG_LOCKDEP | 1097 | #ifdef CONFIG_LOCKDEP |
1098 | /* | 1098 | /* |
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
1119 | 1119 | ||
1120 | lock_map_release(&lockdep_map); | 1120 | lock_map_release(&lockdep_map); |
1121 | 1121 | ||
1122 | if (preempt_count != preempt_count()) { | 1122 | if (count != preempt_count()) { |
1123 | WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", | 1123 | WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", |
1124 | fn, preempt_count, preempt_count()); | 1124 | fn, count, preempt_count()); |
1125 | /* | 1125 | /* |
1126 | * Restore the preempt count. That gives us a decent | 1126 | * Restore the preempt count. That gives us a decent |
1127 | * chance to survive and extract information. If the | 1127 | * chance to survive and extract information. If the |
1128 | * callback kept a lock held, bad luck, but not worse | 1128 | * callback kept a lock held, bad luck, but not worse |
1129 | * than the BUG() we had. | 1129 | * than the BUG() we had. |
1130 | */ | 1130 | */ |
1131 | preempt_count() = preempt_count; | 1131 | preempt_count_set(count); |
1132 | } | 1132 | } |
1133 | } | 1133 | } |
1134 | 1134 | ||
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 6dc09d8f4c24..872a15a2a637 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c | |||
@@ -1002,7 +1002,7 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) | |||
1002 | * Some tests (e.g. double-unlock) might corrupt the preemption | 1002 | * Some tests (e.g. double-unlock) might corrupt the preemption |
1003 | * count, so restore it: | 1003 | * count, so restore it: |
1004 | */ | 1004 | */ |
1005 | preempt_count() = saved_preempt_count; | 1005 | preempt_count_set(saved_preempt_count); |
1006 | #ifdef CONFIG_TRACE_IRQFLAGS | 1006 | #ifdef CONFIG_TRACE_IRQFLAGS |
1007 | if (softirq_count()) | 1007 | if (softirq_count()) |
1008 | current->softirqs_enabled = 0; | 1008 | current->softirqs_enabled = 0; |
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4c0d0e51d49e..04abe53f12a1 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c | |||
@@ -9,10 +9,9 @@ | |||
9 | 9 | ||
10 | notrace unsigned int debug_smp_processor_id(void) | 10 | notrace unsigned int debug_smp_processor_id(void) |
11 | { | 11 | { |
12 | unsigned long preempt_count = preempt_count(); | ||
13 | int this_cpu = raw_smp_processor_id(); | 12 | int this_cpu = raw_smp_processor_id(); |
14 | 13 | ||
15 | if (likely(preempt_count)) | 14 | if (likely(preempt_count())) |
16 | goto out; | 15 | goto out; |
17 | 16 | ||
18 | if (irqs_disabled()) | 17 | if (irqs_disabled()) |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7489884682d8..1be2a1f95b61 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -1278,64 +1278,105 @@ out: | |||
1278 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1278 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1279 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | 1279 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) |
1280 | { | 1280 | { |
1281 | struct anon_vma *anon_vma = NULL; | ||
1281 | struct page *page; | 1282 | struct page *page; |
1282 | unsigned long haddr = addr & HPAGE_PMD_MASK; | 1283 | unsigned long haddr = addr & HPAGE_PMD_MASK; |
1283 | int target_nid; | 1284 | int page_nid = -1, this_nid = numa_node_id(); |
1284 | int current_nid = -1; | 1285 | int target_nid, last_cpupid = -1; |
1285 | bool migrated; | 1286 | bool page_locked; |
1287 | bool migrated = false; | ||
1288 | int flags = 0; | ||
1286 | 1289 | ||
1287 | spin_lock(&mm->page_table_lock); | 1290 | spin_lock(&mm->page_table_lock); |
1288 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1291 | if (unlikely(!pmd_same(pmd, *pmdp))) |
1289 | goto out_unlock; | 1292 | goto out_unlock; |
1290 | 1293 | ||
1291 | page = pmd_page(pmd); | 1294 | page = pmd_page(pmd); |
1292 | get_page(page); | 1295 | BUG_ON(is_huge_zero_page(page)); |
1293 | current_nid = page_to_nid(page); | 1296 | page_nid = page_to_nid(page); |
1297 | last_cpupid = page_cpupid_last(page); | ||
1294 | count_vm_numa_event(NUMA_HINT_FAULTS); | 1298 | count_vm_numa_event(NUMA_HINT_FAULTS); |
1295 | if (current_nid == numa_node_id()) | 1299 | if (page_nid == this_nid) { |
1296 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | 1300 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); |
1301 | flags |= TNF_FAULT_LOCAL; | ||
1302 | } | ||
1303 | |||
1304 | /* | ||
1305 | * Avoid grouping on DSO/COW pages in specific and RO pages | ||
1306 | * in general, RO pages shouldn't hurt as much anyway since | ||
1307 | * they can be in shared cache state. | ||
1308 | */ | ||
1309 | if (!pmd_write(pmd)) | ||
1310 | flags |= TNF_NO_GROUP; | ||
1297 | 1311 | ||
1312 | /* | ||
1313 | * Acquire the page lock to serialise THP migrations but avoid dropping | ||
1314 | * page_table_lock if at all possible | ||
1315 | */ | ||
1316 | page_locked = trylock_page(page); | ||
1298 | target_nid = mpol_misplaced(page, vma, haddr); | 1317 | target_nid = mpol_misplaced(page, vma, haddr); |
1299 | if (target_nid == -1) { | 1318 | if (target_nid == -1) { |
1300 | put_page(page); | 1319 | /* If the page was locked, there are no parallel migrations */ |
1301 | goto clear_pmdnuma; | 1320 | if (page_locked) |
1321 | goto clear_pmdnuma; | ||
1322 | |||
1323 | /* | ||
1324 | * Otherwise wait for potential migrations and retry. We do | ||
1325 | * relock and check_same as the page may no longer be mapped. | ||
1326 | * As the fault is being retried, do not account for it. | ||
1327 | */ | ||
1328 | spin_unlock(&mm->page_table_lock); | ||
1329 | wait_on_page_locked(page); | ||
1330 | page_nid = -1; | ||
1331 | goto out; | ||
1302 | } | 1332 | } |
1303 | 1333 | ||
1304 | /* Acquire the page lock to serialise THP migrations */ | 1334 | /* Page is misplaced, serialise migrations and parallel THP splits */ |
1335 | get_page(page); | ||
1305 | spin_unlock(&mm->page_table_lock); | 1336 | spin_unlock(&mm->page_table_lock); |
1306 | lock_page(page); | 1337 | if (!page_locked) |
1338 | lock_page(page); | ||
1339 | anon_vma = page_lock_anon_vma_read(page); | ||
1307 | 1340 | ||
1308 | /* Confirm the PTE did not while locked */ | 1341 | /* Confirm the PMD did not change while page_table_lock was released */ |
1309 | spin_lock(&mm->page_table_lock); | 1342 | spin_lock(&mm->page_table_lock); |
1310 | if (unlikely(!pmd_same(pmd, *pmdp))) { | 1343 | if (unlikely(!pmd_same(pmd, *pmdp))) { |
1311 | unlock_page(page); | 1344 | unlock_page(page); |
1312 | put_page(page); | 1345 | put_page(page); |
1346 | page_nid = -1; | ||
1313 | goto out_unlock; | 1347 | goto out_unlock; |
1314 | } | 1348 | } |
1315 | spin_unlock(&mm->page_table_lock); | ||
1316 | 1349 | ||
1317 | /* Migrate the THP to the requested node */ | 1350 | /* |
1351 | * Migrate the THP to the requested node, returns with page unlocked | ||
1352 | * and pmd_numa cleared. | ||
1353 | */ | ||
1354 | spin_unlock(&mm->page_table_lock); | ||
1318 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1355 | migrated = migrate_misplaced_transhuge_page(mm, vma, |
1319 | pmdp, pmd, addr, page, target_nid); | 1356 | pmdp, pmd, addr, page, target_nid); |
1320 | if (!migrated) | 1357 | if (migrated) { |
1321 | goto check_same; | 1358 | flags |= TNF_MIGRATED; |
1322 | 1359 | page_nid = target_nid; | |
1323 | task_numa_fault(target_nid, HPAGE_PMD_NR, true); | 1360 | } |
1324 | return 0; | ||
1325 | 1361 | ||
1326 | check_same: | 1362 | goto out; |
1327 | spin_lock(&mm->page_table_lock); | ||
1328 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1329 | goto out_unlock; | ||
1330 | clear_pmdnuma: | 1363 | clear_pmdnuma: |
1364 | BUG_ON(!PageLocked(page)); | ||
1331 | pmd = pmd_mknonnuma(pmd); | 1365 | pmd = pmd_mknonnuma(pmd); |
1332 | set_pmd_at(mm, haddr, pmdp, pmd); | 1366 | set_pmd_at(mm, haddr, pmdp, pmd); |
1333 | VM_BUG_ON(pmd_numa(*pmdp)); | 1367 | VM_BUG_ON(pmd_numa(*pmdp)); |
1334 | update_mmu_cache_pmd(vma, addr, pmdp); | 1368 | update_mmu_cache_pmd(vma, addr, pmdp); |
1369 | unlock_page(page); | ||
1335 | out_unlock: | 1370 | out_unlock: |
1336 | spin_unlock(&mm->page_table_lock); | 1371 | spin_unlock(&mm->page_table_lock); |
1337 | if (current_nid != -1) | 1372 | |
1338 | task_numa_fault(current_nid, HPAGE_PMD_NR, false); | 1373 | out: |
1374 | if (anon_vma) | ||
1375 | page_unlock_anon_vma_read(anon_vma); | ||
1376 | |||
1377 | if (page_nid != -1) | ||
1378 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); | ||
1379 | |||
1339 | return 0; | 1380 | return 0; |
1340 | } | 1381 | } |
1341 | 1382 | ||
@@ -1432,6 +1473,12 @@ out: | |||
1432 | return ret; | 1473 | return ret; |
1433 | } | 1474 | } |
1434 | 1475 | ||
1476 | /* | ||
1477 | * Returns | ||
1478 | * - 0 if PMD could not be locked | ||
1478 | * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary | ||
1479 | * - HPAGE_PMD_NR if protections changed and TLB flush necessary | ||
1481 | */ | ||
1435 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1482 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1436 | unsigned long addr, pgprot_t newprot, int prot_numa) | 1483 | unsigned long addr, pgprot_t newprot, int prot_numa) |
1437 | { | 1484 | { |
@@ -1440,22 +1487,34 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1440 | 1487 | ||
1441 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1488 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1442 | pmd_t entry; | 1489 | pmd_t entry; |
1443 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1490 | ret = 1; |
1444 | if (!prot_numa) { | 1491 | if (!prot_numa) { |
1492 | entry = pmdp_get_and_clear(mm, addr, pmd); | ||
1445 | entry = pmd_modify(entry, newprot); | 1493 | entry = pmd_modify(entry, newprot); |
1494 | ret = HPAGE_PMD_NR; | ||
1446 | BUG_ON(pmd_write(entry)); | 1495 | BUG_ON(pmd_write(entry)); |
1447 | } else { | 1496 | } else { |
1448 | struct page *page = pmd_page(*pmd); | 1497 | struct page *page = pmd_page(*pmd); |
1449 | 1498 | ||
1450 | /* only check non-shared pages */ | 1499 | /* |
1451 | if (page_mapcount(page) == 1 && | 1500 | * Do not trap faults against the zero page. The |
1501 | * read-only data is likely to be read-cached on the | ||
1502 | * local CPU cache and it is less useful to know about | ||
1503 | * local vs remote hits on the zero page. | ||
1504 | */ | ||
1505 | if (!is_huge_zero_page(page) && | ||
1452 | !pmd_numa(*pmd)) { | 1506 | !pmd_numa(*pmd)) { |
1507 | entry = pmdp_get_and_clear(mm, addr, pmd); | ||
1453 | entry = pmd_mknuma(entry); | 1508 | entry = pmd_mknuma(entry); |
1509 | ret = HPAGE_PMD_NR; | ||
1454 | } | 1510 | } |
1455 | } | 1511 | } |
1456 | set_pmd_at(mm, addr, pmd, entry); | 1512 | |
1513 | /* Set PMD if cleared earlier */ | ||
1514 | if (ret == HPAGE_PMD_NR) | ||
1515 | set_pmd_at(mm, addr, pmd, entry); | ||
1516 | |||
1457 | spin_unlock(&vma->vm_mm->page_table_lock); | 1517 | spin_unlock(&vma->vm_mm->page_table_lock); |
1458 | ret = 1; | ||
1459 | } | 1518 | } |
1460 | 1519 | ||
1461 | return ret; | 1520 | return ret; |
@@ -1636,7 +1695,7 @@ static void __split_huge_page_refcount(struct page *page, | |||
1636 | page_tail->mapping = page->mapping; | 1695 | page_tail->mapping = page->mapping; |
1637 | 1696 | ||
1638 | page_tail->index = page->index + i; | 1697 | page_tail->index = page->index + i; |
1639 | page_nid_xchg_last(page_tail, page_nid_last(page)); | 1698 | page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); |
1640 | 1699 | ||
1641 | BUG_ON(!PageAnon(page_tail)); | 1700 | BUG_ON(!PageAnon(page_tail)); |
1642 | BUG_ON(!PageUptodate(page_tail)); | 1701 | BUG_ON(!PageUptodate(page_tail)); |
diff --git a/mm/memory.c b/mm/memory.c index ca0003947115..1c7501f7fb1a 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -69,8 +69,8 @@ | |||
69 | 69 | ||
70 | #include "internal.h" | 70 | #include "internal.h" |
71 | 71 | ||
72 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | 72 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS |
73 | #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. | 73 | #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. |
74 | #endif | 74 | #endif |
75 | 75 | ||
76 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 76 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
@@ -2719,6 +2719,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2719 | get_page(dirty_page); | 2719 | get_page(dirty_page); |
2720 | 2720 | ||
2721 | reuse: | 2721 | reuse: |
2722 | /* | ||
2723 | * Clear the pages cpupid information as the existing | ||
2724 | * information potentially belongs to a now completely | ||
2725 | * unrelated process. | ||
2726 | */ | ||
2727 | if (old_page) | ||
2728 | page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); | ||
2729 | |||
2722 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2730 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2723 | entry = pte_mkyoung(orig_pte); | 2731 | entry = pte_mkyoung(orig_pte); |
2724 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2732 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
@@ -3519,13 +3527,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3519 | } | 3527 | } |
3520 | 3528 | ||
3521 | int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3529 | int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
3522 | unsigned long addr, int current_nid) | 3530 | unsigned long addr, int page_nid, |
3531 | int *flags) | ||
3523 | { | 3532 | { |
3524 | get_page(page); | 3533 | get_page(page); |
3525 | 3534 | ||
3526 | count_vm_numa_event(NUMA_HINT_FAULTS); | 3535 | count_vm_numa_event(NUMA_HINT_FAULTS); |
3527 | if (current_nid == numa_node_id()) | 3536 | if (page_nid == numa_node_id()) { |
3528 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | 3537 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); |
3538 | *flags |= TNF_FAULT_LOCAL; | ||
3539 | } | ||
3529 | 3540 | ||
3530 | return mpol_misplaced(page, vma, addr); | 3541 | return mpol_misplaced(page, vma, addr); |
3531 | } | 3542 | } |
@@ -3535,9 +3546,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3535 | { | 3546 | { |
3536 | struct page *page = NULL; | 3547 | struct page *page = NULL; |
3537 | spinlock_t *ptl; | 3548 | spinlock_t *ptl; |
3538 | int current_nid = -1; | 3549 | int page_nid = -1; |
3550 | int last_cpupid; | ||
3539 | int target_nid; | 3551 | int target_nid; |
3540 | bool migrated = false; | 3552 | bool migrated = false; |
3553 | int flags = 0; | ||
3541 | 3554 | ||
3542 | /* | 3555 | /* |
3543 | * The "pte" at this point cannot be used safely without | 3556 | * The "pte" at this point cannot be used safely without |
@@ -3564,123 +3577,44 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3564 | pte_unmap_unlock(ptep, ptl); | 3577 | pte_unmap_unlock(ptep, ptl); |
3565 | return 0; | 3578 | return 0; |
3566 | } | 3579 | } |
3580 | BUG_ON(is_zero_pfn(page_to_pfn(page))); | ||
3581 | |||
3582 | /* | ||
3583 | * Avoid grouping on DSO/COW pages in specific and RO pages | ||
3584 | * in general, RO pages shouldn't hurt as much anyway since | ||
3585 | * they can be in shared cache state. | ||
3586 | */ | ||
3587 | if (!pte_write(pte)) | ||
3588 | flags |= TNF_NO_GROUP; | ||
3589 | |||
3590 | /* | ||
3591 | * Flag if the page is shared between multiple address spaces. This | ||
3592 | * is later used when determining whether to group tasks together | ||
3593 | */ | ||
3594 | if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) | ||
3595 | flags |= TNF_SHARED; | ||
3567 | 3596 | ||
3568 | current_nid = page_to_nid(page); | 3597 | last_cpupid = page_cpupid_last(page); |
3569 | target_nid = numa_migrate_prep(page, vma, addr, current_nid); | 3598 | page_nid = page_to_nid(page); |
3599 | target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); | ||
3570 | pte_unmap_unlock(ptep, ptl); | 3600 | pte_unmap_unlock(ptep, ptl); |
3571 | if (target_nid == -1) { | 3601 | if (target_nid == -1) { |
3572 | /* | ||
3573 | * Account for the fault against the current node if it not | ||
3574 | * being replaced regardless of where the page is located. | ||
3575 | */ | ||
3576 | current_nid = numa_node_id(); | ||
3577 | put_page(page); | 3602 | put_page(page); |
3578 | goto out; | 3603 | goto out; |
3579 | } | 3604 | } |
3580 | 3605 | ||
3581 | /* Migrate to the requested node */ | 3606 | /* Migrate to the requested node */ |
3582 | migrated = migrate_misplaced_page(page, target_nid); | 3607 | migrated = migrate_misplaced_page(page, vma, target_nid); |
3583 | if (migrated) | 3608 | if (migrated) { |
3584 | current_nid = target_nid; | 3609 | page_nid = target_nid; |
3585 | 3610 | flags |= TNF_MIGRATED; | |
3586 | out: | ||
3587 | if (current_nid != -1) | ||
3588 | task_numa_fault(current_nid, 1, migrated); | ||
3589 | return 0; | ||
3590 | } | ||
3591 | |||
3592 | /* NUMA hinting page fault entry point for regular pmds */ | ||
3593 | #ifdef CONFIG_NUMA_BALANCING | ||
3594 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3595 | unsigned long addr, pmd_t *pmdp) | ||
3596 | { | ||
3597 | pmd_t pmd; | ||
3598 | pte_t *pte, *orig_pte; | ||
3599 | unsigned long _addr = addr & PMD_MASK; | ||
3600 | unsigned long offset; | ||
3601 | spinlock_t *ptl; | ||
3602 | bool numa = false; | ||
3603 | int local_nid = numa_node_id(); | ||
3604 | |||
3605 | spin_lock(&mm->page_table_lock); | ||
3606 | pmd = *pmdp; | ||
3607 | if (pmd_numa(pmd)) { | ||
3608 | set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); | ||
3609 | numa = true; | ||
3610 | } | ||
3611 | spin_unlock(&mm->page_table_lock); | ||
3612 | |||
3613 | if (!numa) | ||
3614 | return 0; | ||
3615 | |||
3616 | /* we're in a page fault so some vma must be in the range */ | ||
3617 | BUG_ON(!vma); | ||
3618 | BUG_ON(vma->vm_start >= _addr + PMD_SIZE); | ||
3619 | offset = max(_addr, vma->vm_start) & ~PMD_MASK; | ||
3620 | VM_BUG_ON(offset >= PMD_SIZE); | ||
3621 | orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); | ||
3622 | pte += offset >> PAGE_SHIFT; | ||
3623 | for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { | ||
3624 | pte_t pteval = *pte; | ||
3625 | struct page *page; | ||
3626 | int curr_nid = local_nid; | ||
3627 | int target_nid; | ||
3628 | bool migrated; | ||
3629 | if (!pte_present(pteval)) | ||
3630 | continue; | ||
3631 | if (!pte_numa(pteval)) | ||
3632 | continue; | ||
3633 | if (addr >= vma->vm_end) { | ||
3634 | vma = find_vma(mm, addr); | ||
3635 | /* there's a pte present so there must be a vma */ | ||
3636 | BUG_ON(!vma); | ||
3637 | BUG_ON(addr < vma->vm_start); | ||
3638 | } | ||
3639 | if (pte_numa(pteval)) { | ||
3640 | pteval = pte_mknonnuma(pteval); | ||
3641 | set_pte_at(mm, addr, pte, pteval); | ||
3642 | } | ||
3643 | page = vm_normal_page(vma, addr, pteval); | ||
3644 | if (unlikely(!page)) | ||
3645 | continue; | ||
3646 | /* only check non-shared pages */ | ||
3647 | if (unlikely(page_mapcount(page) != 1)) | ||
3648 | continue; | ||
3649 | |||
3650 | /* | ||
3651 | * Note that the NUMA fault is later accounted to either | ||
3652 | * the node that is currently running or where the page is | ||
3653 | * migrated to. | ||
3654 | */ | ||
3655 | curr_nid = local_nid; | ||
3656 | target_nid = numa_migrate_prep(page, vma, addr, | ||
3657 | page_to_nid(page)); | ||
3658 | if (target_nid == -1) { | ||
3659 | put_page(page); | ||
3660 | continue; | ||
3661 | } | ||
3662 | |||
3663 | /* Migrate to the requested node */ | ||
3664 | pte_unmap_unlock(pte, ptl); | ||
3665 | migrated = migrate_misplaced_page(page, target_nid); | ||
3666 | if (migrated) | ||
3667 | curr_nid = target_nid; | ||
3668 | task_numa_fault(curr_nid, 1, migrated); | ||
3669 | |||
3670 | pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); | ||
3671 | } | 3611 | } |
3672 | pte_unmap_unlock(orig_pte, ptl); | ||
3673 | 3612 | ||
3613 | out: | ||
3614 | if (page_nid != -1) | ||
3615 | task_numa_fault(last_cpupid, page_nid, 1, flags); | ||
3674 | return 0; | 3616 | return 0; |
3675 | } | 3617 | } |
3676 | #else | ||
3677 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3678 | unsigned long addr, pmd_t *pmdp) | ||
3679 | { | ||
3680 | BUG(); | ||
3681 | return 0; | ||
3682 | } | ||
3683 | #endif /* CONFIG_NUMA_BALANCING */ | ||
3684 | 3618 | ||
3685 | /* | 3619 | /* |
3686 | * These routines also need to handle stuff like marking pages dirty | 3620 | * These routines also need to handle stuff like marking pages dirty |
@@ -3820,8 +3754,8 @@ retry: | |||
3820 | } | 3754 | } |
3821 | } | 3755 | } |
3822 | 3756 | ||
3823 | if (pmd_numa(*pmd)) | 3757 | /* THP should already have been handled */ |
3824 | return do_pmd_numa_page(mm, vma, address, pmd); | 3758 | BUG_ON(pmd_numa(*pmd)); |
3825 | 3759 | ||
3826 | /* | 3760 | /* |
3827 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3761 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 04729647f359..71cb253368cb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1679 | return pol; | 1679 | return pol; |
1680 | } | 1680 | } |
1681 | 1681 | ||
1682 | bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) | ||
1683 | { | ||
1684 | struct mempolicy *pol = get_task_policy(task); | ||
1685 | if (vma) { | ||
1686 | if (vma->vm_ops && vma->vm_ops->get_policy) { | ||
1687 | bool ret = false; | ||
1688 | |||
1689 | pol = vma->vm_ops->get_policy(vma, vma->vm_start); | ||
1690 | if (pol && (pol->flags & MPOL_F_MOF)) | ||
1691 | ret = true; | ||
1692 | mpol_cond_put(pol); | ||
1693 | |||
1694 | return ret; | ||
1695 | } else if (vma->vm_policy) { | ||
1696 | pol = vma->vm_policy; | ||
1697 | } | ||
1698 | } | ||
1699 | |||
1700 | if (!pol) | ||
1701 | return default_policy.flags & MPOL_F_MOF; | ||
1702 | |||
1703 | return pol->flags & MPOL_F_MOF; | ||
1704 | } | ||
1705 | |||
1682 | static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) | 1706 | static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) |
1683 | { | 1707 | { |
1684 | enum zone_type dynamic_policy_zone = policy_zone; | 1708 | enum zone_type dynamic_policy_zone = policy_zone; |
@@ -2277,6 +2301,35 @@ static void sp_free(struct sp_node *n) | |||
2277 | kmem_cache_free(sn_cache, n); | 2301 | kmem_cache_free(sn_cache, n); |
2278 | } | 2302 | } |
2279 | 2303 | ||
2304 | #ifdef CONFIG_NUMA_BALANCING | ||
2305 | static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) | ||
2306 | { | ||
2307 | /* Never defer a private fault */ | ||
2308 | if (cpupid_match_pid(p, last_cpupid)) | ||
2309 | return false; | ||
2310 | |||
2311 | if (p->numa_migrate_deferred) { | ||
2312 | p->numa_migrate_deferred--; | ||
2313 | return true; | ||
2314 | } | ||
2315 | return false; | ||
2316 | } | ||
2317 | |||
2318 | static inline void defer_numa_migrate(struct task_struct *p) | ||
2319 | { | ||
2320 | p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; | ||
2321 | } | ||
2322 | #else | ||
2323 | static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) | ||
2324 | { | ||
2325 | return false; | ||
2326 | } | ||
2327 | |||
2328 | static inline void defer_numa_migrate(struct task_struct *p) | ||
2329 | { | ||
2330 | } | ||
2331 | #endif /* CONFIG_NUMA_BALANCING */ | ||
2332 | |||
2280 | /** | 2333 | /** |
2281 | * mpol_misplaced - check whether current page node is valid in policy | 2334 | * mpol_misplaced - check whether current page node is valid in policy |
2282 | * | 2335 | * |
@@ -2300,6 +2353,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2300 | struct zone *zone; | 2353 | struct zone *zone; |
2301 | int curnid = page_to_nid(page); | 2354 | int curnid = page_to_nid(page); |
2302 | unsigned long pgoff; | 2355 | unsigned long pgoff; |
2356 | int thiscpu = raw_smp_processor_id(); | ||
2357 | int thisnid = cpu_to_node(thiscpu); | ||
2303 | int polnid = -1; | 2358 | int polnid = -1; |
2304 | int ret = -1; | 2359 | int ret = -1; |
2305 | 2360 | ||
@@ -2348,9 +2403,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2348 | 2403 | ||
2349 | /* Migrate the page towards the node whose CPU is referencing it */ | 2404 | /* Migrate the page towards the node whose CPU is referencing it */ |
2350 | if (pol->flags & MPOL_F_MORON) { | 2405 | if (pol->flags & MPOL_F_MORON) { |
2351 | int last_nid; | 2406 | int last_cpupid; |
2407 | int this_cpupid; | ||
2352 | 2408 | ||
2353 | polnid = numa_node_id(); | 2409 | polnid = thisnid; |
2410 | this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); | ||
2354 | 2411 | ||
2355 | /* | 2412 | /* |
2356 | * Multi-stage node selection is used in conjunction | 2413 | * Multi-stage node selection is used in conjunction |
@@ -2373,8 +2430,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2373 | * it less likely we act on an unlikely task<->page | 2430 | * it less likely we act on an unlikely task<->page |
2374 | * relation. | 2431 | * relation. |
2375 | */ | 2432 | */ |
2376 | last_nid = page_nid_xchg_last(page, polnid); | 2433 | last_cpupid = page_cpupid_xchg_last(page, this_cpupid); |
2377 | if (last_nid != polnid) | 2434 | if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { |
2435 | |||
2436 | /* See sysctl_numa_balancing_migrate_deferred comment */ | ||
2437 | if (!cpupid_match_pid(current, last_cpupid)) | ||
2438 | defer_numa_migrate(current); | ||
2439 | |||
2440 | goto out; | ||
2441 | } | ||
2442 | |||
2443 | /* | ||
2444 | * The quadratic filter above reduces extraneous migration | ||
2445 | * of shared pages somewhat. This code reduces it even more, | ||
2446 | * reducing the overhead of page migrations of shared pages. | ||
2447 | * This makes workloads with shared pages rely more on | ||
2448 | * "move task near its memory", and less on "move memory | ||
2449 | * towards its task", which is exactly what we want. | ||
2450 | */ | ||
2451 | if (numa_migrate_deferred(current, last_cpupid)) | ||
2378 | goto out; | 2452 | goto out; |
2379 | } | 2453 | } |
2380 | 2454 | ||
diff --git a/mm/migrate.c b/mm/migrate.c index a26bccd44ccb..44c1fa9d6f54 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -443,6 +443,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
443 | */ | 443 | */ |
444 | void migrate_page_copy(struct page *newpage, struct page *page) | 444 | void migrate_page_copy(struct page *newpage, struct page *page) |
445 | { | 445 | { |
446 | int cpupid; | ||
447 | |||
446 | if (PageHuge(page) || PageTransHuge(page)) | 448 | if (PageHuge(page) || PageTransHuge(page)) |
447 | copy_huge_page(newpage, page); | 449 | copy_huge_page(newpage, page); |
448 | else | 450 | else |
@@ -479,6 +481,13 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
479 | __set_page_dirty_nobuffers(newpage); | 481 | __set_page_dirty_nobuffers(newpage); |
480 | } | 482 | } |
481 | 483 | ||
484 | /* | ||
485 | * Copy NUMA information to the new page, to prevent over-eager | ||
486 | * future migrations of this same page. | ||
487 | */ | ||
488 | cpupid = page_cpupid_xchg_last(page, -1); | ||
489 | page_cpupid_xchg_last(newpage, cpupid); | ||
490 | |||
482 | mlock_migrate_page(newpage, page); | 491 | mlock_migrate_page(newpage, page); |
483 | ksm_migrate_page(newpage, page); | 492 | ksm_migrate_page(newpage, page); |
484 | /* | 493 | /* |
@@ -1498,7 +1507,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, | |||
1498 | __GFP_NOWARN) & | 1507 | __GFP_NOWARN) & |
1499 | ~GFP_IOFS, 0); | 1508 | ~GFP_IOFS, 0); |
1500 | if (newpage) | 1509 | if (newpage) |
1501 | page_nid_xchg_last(newpage, page_nid_last(page)); | 1510 | page_cpupid_xchg_last(newpage, page_cpupid_last(page)); |
1502 | 1511 | ||
1503 | return newpage; | 1512 | return newpage; |
1504 | } | 1513 | } |
@@ -1599,7 +1608,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | |||
1599 | * node. Caller is expected to have an elevated reference count on | 1608 | * node. Caller is expected to have an elevated reference count on |
1600 | * the page that will be dropped by this function before returning. | 1609 | * the page that will be dropped by this function before returning. |
1601 | */ | 1610 | */ |
1602 | int migrate_misplaced_page(struct page *page, int node) | 1611 | int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, |
1612 | int node) | ||
1603 | { | 1613 | { |
1604 | pg_data_t *pgdat = NODE_DATA(node); | 1614 | pg_data_t *pgdat = NODE_DATA(node); |
1605 | int isolated; | 1615 | int isolated; |
@@ -1607,10 +1617,11 @@ int migrate_misplaced_page(struct page *page, int node) | |||
1607 | LIST_HEAD(migratepages); | 1617 | LIST_HEAD(migratepages); |
1608 | 1618 | ||
1609 | /* | 1619 | /* |
1610 | * Don't migrate pages that are mapped in multiple processes. | 1620 | * Don't migrate file pages that are mapped in multiple processes |
1611 | * TODO: Handle false sharing detection instead of this hammer | 1621 | * with execute permissions as they are probably shared libraries. |
1612 | */ | 1622 | */ |
1613 | if (page_mapcount(page) != 1) | 1623 | if (page_mapcount(page) != 1 && page_is_file_cache(page) && |
1624 | (vma->vm_flags & VM_EXEC)) | ||
1614 | goto out; | 1625 | goto out; |
1615 | 1626 | ||
1616 | /* | 1627 | /* |
@@ -1661,13 +1672,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1661 | int page_lru = page_is_file_cache(page); | 1672 | int page_lru = page_is_file_cache(page); |
1662 | 1673 | ||
1663 | /* | 1674 | /* |
1664 | * Don't migrate pages that are mapped in multiple processes. | ||
1665 | * TODO: Handle false sharing detection instead of this hammer | ||
1666 | */ | ||
1667 | if (page_mapcount(page) != 1) | ||
1668 | goto out_dropref; | ||
1669 | |||
1670 | /* | ||
1671 | * Rate-limit the amount of data that is being migrated to a node. | 1675 | * Rate-limit the amount of data that is being migrated to a node. |
1672 | * Optimal placement is no good if the memory bus is saturated and | 1676 | * Optimal placement is no good if the memory bus is saturated and |
1673 | * all the time is being spent migrating! | 1677 | * all the time is being spent migrating! |
@@ -1680,7 +1684,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1680 | if (!new_page) | 1684 | if (!new_page) |
1681 | goto out_fail; | 1685 | goto out_fail; |
1682 | 1686 | ||
1683 | page_nid_xchg_last(new_page, page_nid_last(page)); | 1687 | page_cpupid_xchg_last(new_page, page_cpupid_last(page)); |
1684 | 1688 | ||
1685 | isolated = numamigrate_isolate_page(pgdat, page); | 1689 | isolated = numamigrate_isolate_page(pgdat, page); |
1686 | if (!isolated) { | 1690 | if (!isolated) { |
@@ -1713,12 +1717,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1713 | unlock_page(new_page); | 1717 | unlock_page(new_page); |
1714 | put_page(new_page); /* Free it */ | 1718 | put_page(new_page); /* Free it */ |
1715 | 1719 | ||
1716 | unlock_page(page); | 1720 | /* Retake the callers reference and putback on LRU */ |
1721 | get_page(page); | ||
1717 | putback_lru_page(page); | 1722 | putback_lru_page(page); |
1718 | 1723 | mod_zone_page_state(page_zone(page), | |
1719 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | 1724 | NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); |
1720 | isolated = 0; | 1725 | goto out_fail; |
1721 | goto out; | ||
1722 | } | 1726 | } |
1723 | 1727 | ||
1724 | /* | 1728 | /* |
@@ -1735,9 +1739,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1735 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1739 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1736 | entry = pmd_mkhuge(entry); | 1740 | entry = pmd_mkhuge(entry); |
1737 | 1741 | ||
1738 | page_add_new_anon_rmap(new_page, vma, haddr); | 1742 | pmdp_clear_flush(vma, haddr, pmd); |
1739 | |||
1740 | set_pmd_at(mm, haddr, pmd, entry); | 1743 | set_pmd_at(mm, haddr, pmd, entry); |
1744 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
1741 | update_mmu_cache_pmd(vma, address, &entry); | 1745 | update_mmu_cache_pmd(vma, address, &entry); |
1742 | page_remove_rmap(page); | 1746 | page_remove_rmap(page); |
1743 | /* | 1747 | /* |
@@ -1756,7 +1760,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1756 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); | 1760 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); |
1757 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); | 1761 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); |
1758 | 1762 | ||
1759 | out: | ||
1760 | mod_zone_page_state(page_zone(page), | 1763 | mod_zone_page_state(page_zone(page), |
1761 | NR_ISOLATED_ANON + page_lru, | 1764 | NR_ISOLATED_ANON + page_lru, |
1762 | -HPAGE_PMD_NR); | 1765 | -HPAGE_PMD_NR); |
@@ -1765,6 +1768,10 @@ out: | |||
1765 | out_fail: | 1768 | out_fail: |
1766 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | 1769 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); |
1767 | out_dropref: | 1770 | out_dropref: |
1771 | entry = pmd_mknonnuma(entry); | ||
1772 | set_pmd_at(mm, haddr, pmd, entry); | ||
1773 | update_mmu_cache_pmd(vma, address, &entry); | ||
1774 | |||
1768 | unlock_page(page); | 1775 | unlock_page(page); |
1769 | put_page(page); | 1776 | put_page(page); |
1770 | return 0; | 1777 | return 0; |
diff --git a/mm/mm_init.c b/mm/mm_init.c index 633c08863fd8..68562e92d50c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void) | |||
71 | unsigned long or_mask, add_mask; | 71 | unsigned long or_mask, add_mask; |
72 | 72 | ||
73 | shift = 8 * sizeof(unsigned long); | 73 | shift = 8 * sizeof(unsigned long); |
74 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; | 74 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; |
75 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", | 75 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", |
76 | "Section %d Node %d Zone %d Lastnid %d Flags %d\n", | 76 | "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", |
77 | SECTIONS_WIDTH, | 77 | SECTIONS_WIDTH, |
78 | NODES_WIDTH, | 78 | NODES_WIDTH, |
79 | ZONES_WIDTH, | 79 | ZONES_WIDTH, |
80 | LAST_NID_WIDTH, | 80 | LAST_CPUPID_WIDTH, |
81 | NR_PAGEFLAGS); | 81 | NR_PAGEFLAGS); |
82 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", | 82 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", |
83 | "Section %d Node %d Zone %d Lastnid %d\n", | 83 | "Section %d Node %d Zone %d Lastcpupid %d\n", |
84 | SECTIONS_SHIFT, | 84 | SECTIONS_SHIFT, |
85 | NODES_SHIFT, | 85 | NODES_SHIFT, |
86 | ZONES_SHIFT, | 86 | ZONES_SHIFT, |
87 | LAST_NID_SHIFT); | 87 | LAST_CPUPID_SHIFT); |
88 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", | 88 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", |
89 | "Section %lu Node %lu Zone %lu Lastnid %lu\n", | 89 | "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", |
90 | (unsigned long)SECTIONS_PGSHIFT, | 90 | (unsigned long)SECTIONS_PGSHIFT, |
91 | (unsigned long)NODES_PGSHIFT, | 91 | (unsigned long)NODES_PGSHIFT, |
92 | (unsigned long)ZONES_PGSHIFT, | 92 | (unsigned long)ZONES_PGSHIFT, |
93 | (unsigned long)LAST_NID_PGSHIFT); | 93 | (unsigned long)LAST_CPUPID_PGSHIFT); |
94 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", | 94 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", |
95 | "Node/Zone ID: %lu -> %lu\n", | 95 | "Node/Zone ID: %lu -> %lu\n", |
96 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), | 96 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), |
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void) | |||
102 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | 102 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", |
103 | "Node not in page flags"); | 103 | "Node not in page flags"); |
104 | #endif | 104 | #endif |
105 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | 105 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS |
106 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | 106 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", |
107 | "Last nid not in page flags"); | 107 | "Last cpupid not in page flags"); |
108 | #endif | 108 | #endif |
109 | 109 | ||
110 | if (SECTIONS_WIDTH) { | 110 | if (SECTIONS_WIDTH) { |
diff --git a/mm/mmzone.c b/mm/mmzone.c index 2ac0afbd68f3..bf34fb8556db 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec) | |||
97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | 97 | INIT_LIST_HEAD(&lruvec->lists[lru]); |
98 | } | 98 | } |
99 | 99 | ||
100 | #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) | 100 | #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) |
101 | int page_nid_xchg_last(struct page *page, int nid) | 101 | int page_cpupid_xchg_last(struct page *page, int cpupid) |
102 | { | 102 | { |
103 | unsigned long old_flags, flags; | 103 | unsigned long old_flags, flags; |
104 | int last_nid; | 104 | int last_cpupid; |
105 | 105 | ||
106 | do { | 106 | do { |
107 | old_flags = flags = page->flags; | 107 | old_flags = flags = page->flags; |
108 | last_nid = page_nid_last(page); | 108 | last_cpupid = page_cpupid_last(page); |
109 | 109 | ||
110 | flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); | 110 | flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); |
111 | flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; | 111 | flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; |
112 | } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); | 112 | } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); |
113 | 113 | ||
114 | return last_nid; | 114 | return last_cpupid; |
115 | } | 115 | } |
116 | #endif | 116 | #endif |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4d6b43..a0302ac0be98 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -37,14 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
37 | 37 | ||
38 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 38 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
39 | unsigned long addr, unsigned long end, pgprot_t newprot, | 39 | unsigned long addr, unsigned long end, pgprot_t newprot, |
40 | int dirty_accountable, int prot_numa, bool *ret_all_same_node) | 40 | int dirty_accountable, int prot_numa) |
41 | { | 41 | { |
42 | struct mm_struct *mm = vma->vm_mm; | 42 | struct mm_struct *mm = vma->vm_mm; |
43 | pte_t *pte, oldpte; | 43 | pte_t *pte, oldpte; |
44 | spinlock_t *ptl; | 44 | spinlock_t *ptl; |
45 | unsigned long pages = 0; | 45 | unsigned long pages = 0; |
46 | bool all_same_node = true; | ||
47 | int last_nid = -1; | ||
48 | 46 | ||
49 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 47 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
50 | arch_enter_lazy_mmu_mode(); | 48 | arch_enter_lazy_mmu_mode(); |
@@ -63,15 +61,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
63 | 61 | ||
64 | page = vm_normal_page(vma, addr, oldpte); | 62 | page = vm_normal_page(vma, addr, oldpte); |
65 | if (page) { | 63 | if (page) { |
66 | int this_nid = page_to_nid(page); | 64 | if (!pte_numa(oldpte)) { |
67 | if (last_nid == -1) | ||
68 | last_nid = this_nid; | ||
69 | if (last_nid != this_nid) | ||
70 | all_same_node = false; | ||
71 | |||
72 | /* only check non-shared pages */ | ||
73 | if (!pte_numa(oldpte) && | ||
74 | page_mapcount(page) == 1) { | ||
75 | ptent = pte_mknuma(ptent); | 65 | ptent = pte_mknuma(ptent); |
76 | updated = true; | 66 | updated = true; |
77 | } | 67 | } |
@@ -101,33 +91,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
101 | make_migration_entry_read(&entry); | 91 | make_migration_entry_read(&entry); |
102 | set_pte_at(mm, addr, pte, | 92 | set_pte_at(mm, addr, pte, |
103 | swp_entry_to_pte(entry)); | 93 | swp_entry_to_pte(entry)); |
94 | |||
95 | pages++; | ||
104 | } | 96 | } |
105 | pages++; | ||
106 | } | 97 | } |
107 | } while (pte++, addr += PAGE_SIZE, addr != end); | 98 | } while (pte++, addr += PAGE_SIZE, addr != end); |
108 | arch_leave_lazy_mmu_mode(); | 99 | arch_leave_lazy_mmu_mode(); |
109 | pte_unmap_unlock(pte - 1, ptl); | 100 | pte_unmap_unlock(pte - 1, ptl); |
110 | 101 | ||
111 | *ret_all_same_node = all_same_node; | ||
112 | return pages; | 102 | return pages; |
113 | } | 103 | } |
114 | 104 | ||
115 | #ifdef CONFIG_NUMA_BALANCING | ||
116 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
117 | pmd_t *pmd) | ||
118 | { | ||
119 | spin_lock(&mm->page_table_lock); | ||
120 | set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); | ||
121 | spin_unlock(&mm->page_table_lock); | ||
122 | } | ||
123 | #else | ||
124 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
125 | pmd_t *pmd) | ||
126 | { | ||
127 | BUG(); | ||
128 | } | ||
129 | #endif /* CONFIG_NUMA_BALANCING */ | ||
130 | |||
131 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | 105 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, |
132 | pud_t *pud, unsigned long addr, unsigned long end, | 106 | pud_t *pud, unsigned long addr, unsigned long end, |
133 | pgprot_t newprot, int dirty_accountable, int prot_numa) | 107 | pgprot_t newprot, int dirty_accountable, int prot_numa) |
@@ -135,34 +109,33 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
135 | pmd_t *pmd; | 109 | pmd_t *pmd; |
136 | unsigned long next; | 110 | unsigned long next; |
137 | unsigned long pages = 0; | 111 | unsigned long pages = 0; |
138 | bool all_same_node; | ||
139 | 112 | ||
140 | pmd = pmd_offset(pud, addr); | 113 | pmd = pmd_offset(pud, addr); |
141 | do { | 114 | do { |
115 | unsigned long this_pages; | ||
116 | |||
142 | next = pmd_addr_end(addr, end); | 117 | next = pmd_addr_end(addr, end); |
143 | if (pmd_trans_huge(*pmd)) { | 118 | if (pmd_trans_huge(*pmd)) { |
144 | if (next - addr != HPAGE_PMD_SIZE) | 119 | if (next - addr != HPAGE_PMD_SIZE) |
145 | split_huge_page_pmd(vma, addr, pmd); | 120 | split_huge_page_pmd(vma, addr, pmd); |
146 | else if (change_huge_pmd(vma, pmd, addr, newprot, | 121 | else { |
147 | prot_numa)) { | 122 | int nr_ptes = change_huge_pmd(vma, pmd, addr, |
148 | pages += HPAGE_PMD_NR; | 123 | newprot, prot_numa); |
149 | continue; | 124 | |
125 | if (nr_ptes) { | ||
126 | if (nr_ptes == HPAGE_PMD_NR) | ||
127 | pages++; | ||
128 | |||
129 | continue; | ||
130 | } | ||
150 | } | 131 | } |
151 | /* fall through */ | 132 | /* fall through */ |
152 | } | 133 | } |
153 | if (pmd_none_or_clear_bad(pmd)) | 134 | if (pmd_none_or_clear_bad(pmd)) |
154 | continue; | 135 | continue; |
155 | pages += change_pte_range(vma, pmd, addr, next, newprot, | 136 | this_pages = change_pte_range(vma, pmd, addr, next, newprot, |
156 | dirty_accountable, prot_numa, &all_same_node); | 137 | dirty_accountable, prot_numa); |
157 | 138 | pages += this_pages; | |
158 | /* | ||
159 | * If we are changing protections for NUMA hinting faults then | ||
160 | * set pmd_numa if the examined pages were all on the same | ||
161 | * node. This allows a regular PMD to be handled as one fault | ||
162 | * and effectively batches the taking of the PTL | ||
163 | */ | ||
164 | if (prot_numa && all_same_node) | ||
165 | change_pmd_protnuma(vma->vm_mm, addr, pmd); | ||
166 | } while (pmd++, addr = next, addr != end); | 139 | } while (pmd++, addr = next, addr != end); |
167 | 140 | ||
168 | return pages; | 141 | return pages; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd886fac451a..73d812f16dde 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page) | |||
626 | bad_page(page); | 626 | bad_page(page); |
627 | return 1; | 627 | return 1; |
628 | } | 628 | } |
629 | page_nid_reset_last(page); | 629 | page_cpupid_reset_last(page); |
630 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 630 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
631 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 631 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
632 | return 0; | 632 | return 0; |
@@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4015 | mminit_verify_page_links(page, zone, nid, pfn); | 4015 | mminit_verify_page_links(page, zone, nid, pfn); |
4016 | init_page_count(page); | 4016 | init_page_count(page); |
4017 | page_mapcount_reset(page); | 4017 | page_mapcount_reset(page); |
4018 | page_nid_reset_last(page); | 4018 | page_cpupid_reset_last(page); |
4019 | SetPageReserved(page); | 4019 | SetPageReserved(page); |
4020 | /* | 4020 | /* |
4021 | * Mark the block movable so that blocks are reserved for | 4021 | * Mark the block movable so that blocks are reserved for |
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 0578d4fa00a9..0f676908d15b 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c | |||
@@ -2563,9 +2563,8 @@ bed: | |||
2563 | jiffies + msecs_to_jiffies(val)); | 2563 | jiffies + msecs_to_jiffies(val)); |
2564 | 2564 | ||
2565 | /* Wait for IR-LMP to call us back */ | 2565 | /* Wait for IR-LMP to call us back */ |
2566 | __wait_event_interruptible(self->query_wait, | 2566 | err = __wait_event_interruptible(self->query_wait, |
2567 | (self->cachedaddr != 0 || self->errno == -ETIME), | 2567 | (self->cachedaddr != 0 || self->errno == -ETIME)); |
2568 | err); | ||
2569 | 2568 | ||
2570 | /* If watchdog is still activated, kill it! */ | 2569 | /* If watchdog is still activated, kill it! */ |
2571 | del_timer(&(self->watchdog)); | 2570 | del_timer(&(self->watchdog)); |
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index f4484719f3e6..f63c2388f38d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c | |||
@@ -1637,12 +1637,9 @@ static int sync_thread_master(void *data) | |||
1637 | continue; | 1637 | continue; |
1638 | } | 1638 | } |
1639 | while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { | 1639 | while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { |
1640 | int ret = 0; | 1640 | int ret = __wait_event_interruptible(*sk_sleep(sk), |
1641 | |||
1642 | __wait_event_interruptible(*sk_sleep(sk), | ||
1643 | sock_writeable(sk) || | 1641 | sock_writeable(sk) || |
1644 | kthread_should_stop(), | 1642 | kthread_should_stop()); |
1645 | ret); | ||
1646 | if (unlikely(kthread_should_stop())) | 1643 | if (unlikely(kthread_should_stop())) |
1647 | goto done; | 1644 | goto done; |
1648 | } | 1645 | } |