-rw-r--r--   Documentation/sysctl/kernel.txt             |  10
-rw-r--r--   arch/arm/kernel/process.c                   |  16
-rw-r--r--   arch/powerpc/platforms/cell/spufs/sched.c   |   1
-rw-r--r--   arch/powerpc/platforms/pseries/setup.c      |  34
-rw-r--r--   arch/sh/kernel/idle.c                       |   4
-rw-r--r--   arch/x86/kernel/process.c                   |   5
-rw-r--r--   drivers/cpuidle/cpuidle-pseries.c           |   6
-rw-r--r--   include/linux/sched.h                       |  41
-rw-r--r--   include/linux/sched/prio.h                  |  40
-rw-r--r--   include/linux/sched/rt.h                    |  19
-rw-r--r--   kernel/Makefile                             |   1
-rw-r--r--   kernel/cpu/Makefile                         |   1
-rw-r--r--   kernel/cpu/idle.c                           |   7
-rw-r--r--   kernel/sched/Makefile                       |   2
-rw-r--r--   kernel/sched/core.c                         |  79
-rw-r--r--   kernel/sched/cputime.c                      |   4
-rw-r--r--   kernel/sched/deadline.c                     |  22
-rw-r--r--   kernel/sched/debug.c                        |   7
-rw-r--r--   kernel/sched/fair.c                         | 503
-rw-r--r--   kernel/sched/idle.c                         | 144
-rw-r--r--   kernel/sched/idle_task.c                    |  27
-rw-r--r--   kernel/sched/rt.c                           |  43
-rw-r--r--   kernel/sched/sched.h                        |  29
-rw-r--r--   kernel/sched/stop_task.c                    |  16
-rw-r--r--   kernel/sysctl.c                             |   7
-rw-r--r--   mm/mempolicy.c                              |  74
26 files changed, 746 insertions(+), 396 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index e55124e7c40c..04bf16ad8561 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -441,8 +441,7 @@ feature should be disabled. Otherwise, if the system overhead from the | |||
441 | feature is too high then the rate the kernel samples for NUMA hinting | 441 | feature is too high then the rate the kernel samples for NUMA hinting |
442 | faults may be controlled by the numa_balancing_scan_period_min_ms, | 442 | faults may be controlled by the numa_balancing_scan_period_min_ms, |
443 | numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, | 443 | numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, |
444 | numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and | 444 | numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls. |
445 | numa_balancing_migrate_deferred. | ||
446 | 445 | ||
447 | ============================================================== | 446 | ============================================================== |
448 | 447 | ||
@@ -483,13 +482,6 @@ rate for each task. | |||
483 | numa_balancing_scan_size_mb is how many megabytes worth of pages are | 482 | numa_balancing_scan_size_mb is how many megabytes worth of pages are |
484 | scanned for a given scan. | 483 | scanned for a given scan. |
485 | 484 | ||
486 | numa_balancing_migrate_deferred is how many page migrations get skipped | ||
487 | unconditionally, after a page migration is skipped because a page is shared | ||
488 | with other tasks. This reduces page migration overhead, and determines | ||
489 | how much stronger the "move task near its memory" policy scheduler becomes, | ||
490 | versus the "move memory near its task" memory management policy, for workloads | ||
491 | with shared memory. | ||
492 | |||
493 | ============================================================== | 485 | ============================================================== |
494 | 486 | ||
495 | osrelease, ostype & version: | 487 | osrelease, ostype & version: |
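[Aside, not part of the patch: the remaining numa_balancing_* knobs documented above are ordinary sysctls under /proc/sys/kernel/, so they can be tuned from user space. A minimal C sketch of the file interface (a shell "echo" works equally well):]

    #include <stdio.h>

    /* Illustrative only: raise the minimum NUMA scan period to 1000 ms,
     * i.e. at least one second between scan windows per task. */
    int main(void)
    {
        const char *path = "/proc/sys/kernel/numa_balancing_scan_period_min_ms";
        FILE *f = fopen(path, "w");

        if (!f) {
            perror(path);
            return 1;
        }
        fprintf(f, "%d\n", 1000);
        fclose(f);
        return 0;
    }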
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 92f7b15dd221..adabeababeb0 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -30,7 +30,6 @@ | |||
30 | #include <linux/uaccess.h> | 30 | #include <linux/uaccess.h> |
31 | #include <linux/random.h> | 31 | #include <linux/random.h> |
32 | #include <linux/hw_breakpoint.h> | 32 | #include <linux/hw_breakpoint.h> |
33 | #include <linux/cpuidle.h> | ||
34 | #include <linux/leds.h> | 33 | #include <linux/leds.h> |
35 | #include <linux/reboot.h> | 34 | #include <linux/reboot.h> |
36 | 35 | ||
@@ -133,7 +132,11 @@ EXPORT_SYMBOL_GPL(arm_pm_restart); | |||
133 | 132 | ||
134 | void (*arm_pm_idle)(void); | 133 | void (*arm_pm_idle)(void); |
135 | 134 | ||
136 | static void default_idle(void) | 135 | /* |
136 | * Called from the core idle loop. | ||
137 | */ | ||
138 | |||
139 | void arch_cpu_idle(void) | ||
137 | { | 140 | { |
138 | if (arm_pm_idle) | 141 | if (arm_pm_idle) |
139 | arm_pm_idle(); | 142 | arm_pm_idle(); |
@@ -168,15 +171,6 @@ void arch_cpu_idle_dead(void) | |||
168 | #endif | 171 | #endif |
169 | 172 | ||
170 | /* | 173 | /* |
171 | * Called from the core idle loop. | ||
172 | */ | ||
173 | void arch_cpu_idle(void) | ||
174 | { | ||
175 | if (cpuidle_idle_call()) | ||
176 | default_idle(); | ||
177 | } | ||
178 | |||
179 | /* | ||
180 | * Called by kexec, immediately prior to machine_kexec(). | 174 | * Called by kexec, immediately prior to machine_kexec(). |
181 | * | 175 | * |
182 | * This must completely disable all secondary CPUs; simply causing those CPUs | 176 | * This must completely disable all secondary CPUs; simply causing those CPUs |
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 49318385d4fa..4a0a64fe25df 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -83,7 +83,6 @@ static struct timer_list spuloadavg_timer; | |||
83 | #define MIN_SPU_TIMESLICE max(5 * HZ / (1000 * SPUSCHED_TICK), 1) | 83 | #define MIN_SPU_TIMESLICE max(5 * HZ / (1000 * SPUSCHED_TICK), 1) |
84 | #define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK)) | 84 | #define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK)) |
85 | 85 | ||
86 | #define MAX_USER_PRIO (MAX_PRIO - MAX_RT_PRIO) | ||
87 | #define SCALE_PRIO(x, prio) \ | 86 | #define SCALE_PRIO(x, prio) \ |
88 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE) | 87 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE) |
89 | 88 | ||
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 972df0ffd4dc..2db8cc691bf4 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -39,7 +39,6 @@ | |||
39 | #include <linux/irq.h> | 39 | #include <linux/irq.h> |
40 | #include <linux/seq_file.h> | 40 | #include <linux/seq_file.h> |
41 | #include <linux/root_dev.h> | 41 | #include <linux/root_dev.h> |
42 | #include <linux/cpuidle.h> | ||
43 | #include <linux/of.h> | 42 | #include <linux/of.h> |
44 | #include <linux/kexec.h> | 43 | #include <linux/kexec.h> |
45 | 44 | ||
@@ -356,29 +355,24 @@ early_initcall(alloc_dispatch_log_kmem_cache); | |||
356 | 355 | ||
357 | static void pseries_lpar_idle(void) | 356 | static void pseries_lpar_idle(void) |
358 | { | 357 | { |
359 | /* This would call on the cpuidle framework, and the back-end pseries | 358 | /* |
360 | * driver to go to idle states | 359 | * Default handler to go into low thread priority and possibly |
360 | * low power mode by cedeing processor to hypervisor | ||
361 | */ | 361 | */ |
362 | if (cpuidle_idle_call()) { | ||
363 | /* On error, execute default handler | ||
364 | * to go into low thread priority and possibly | ||
365 | * low power mode by cedeing processor to hypervisor | ||
366 | */ | ||
367 | 362 | ||
368 | /* Indicate to hypervisor that we are idle. */ | 363 | /* Indicate to hypervisor that we are idle. */ |
369 | get_lppaca()->idle = 1; | 364 | get_lppaca()->idle = 1; |
370 | 365 | ||
371 | /* | 366 | /* |
372 | * Yield the processor to the hypervisor. We return if | 367 | * Yield the processor to the hypervisor. We return if |
373 | * an external interrupt occurs (which are driven prior | 368 | * an external interrupt occurs (which are driven prior |
374 | * to returning here) or if a prod occurs from another | 369 | * to returning here) or if a prod occurs from another |
375 | * processor. When returning here, external interrupts | 370 | * processor. When returning here, external interrupts |
376 | * are enabled. | 371 | * are enabled. |
377 | */ | 372 | */ |
378 | cede_processor(); | 373 | cede_processor(); |
379 | 374 | ||
380 | get_lppaca()->idle = 0; | 375 | get_lppaca()->idle = 0; |
381 | } | ||
382 | } | 376 | } |
383 | 377 | ||
384 | /* | 378 | /* |
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 2ea4483fd722..be616ee0cf87 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/thread_info.h> | 16 | #include <linux/thread_info.h> |
17 | #include <linux/irqflags.h> | 17 | #include <linux/irqflags.h> |
18 | #include <linux/smp.h> | 18 | #include <linux/smp.h> |
19 | #include <linux/cpuidle.h> | ||
20 | #include <linux/atomic.h> | 19 | #include <linux/atomic.h> |
21 | #include <asm/pgalloc.h> | 20 | #include <asm/pgalloc.h> |
22 | #include <asm/smp.h> | 21 | #include <asm/smp.h> |
@@ -40,8 +39,7 @@ void arch_cpu_idle_dead(void) | |||
40 | 39 | ||
41 | void arch_cpu_idle(void) | 40 | void arch_cpu_idle(void) |
42 | { | 41 | { |
43 | if (cpuidle_idle_call()) | 42 | sh_idle(); |
44 | sh_idle(); | ||
45 | } | 43 | } |
46 | 44 | ||
47 | void __init select_idle_routine(void) | 45 | void __init select_idle_routine(void) |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95ab8b5..4505e2a950d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void) | |||
298 | */ | 298 | */ |
299 | void arch_cpu_idle(void) | 299 | void arch_cpu_idle(void) |
300 | { | 300 | { |
301 | if (cpuidle_idle_call()) | 301 | x86_idle(); |
302 | x86_idle(); | ||
303 | else | ||
304 | local_irq_enable(); | ||
305 | } | 302 | } |
306 | 303 | ||
307 | /* | 304 | /* |
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 7ab564aa0b1c..6f7b01956885 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -17,6 +17,7 @@ | |||
17 | #include <asm/reg.h> | 17 | #include <asm/reg.h> |
18 | #include <asm/machdep.h> | 18 | #include <asm/machdep.h> |
19 | #include <asm/firmware.h> | 19 | #include <asm/firmware.h> |
20 | #include <asm/runlatch.h> | ||
20 | #include <asm/plpar_wrappers.h> | 21 | #include <asm/plpar_wrappers.h> |
21 | 22 | ||
22 | struct cpuidle_driver pseries_idle_driver = { | 23 | struct cpuidle_driver pseries_idle_driver = { |
@@ -29,6 +30,7 @@ static struct cpuidle_state *cpuidle_state_table; | |||
29 | 30 | ||
30 | static inline void idle_loop_prolog(unsigned long *in_purr) | 31 | static inline void idle_loop_prolog(unsigned long *in_purr) |
31 | { | 32 | { |
33 | ppc64_runlatch_off(); | ||
32 | *in_purr = mfspr(SPRN_PURR); | 34 | *in_purr = mfspr(SPRN_PURR); |
33 | /* | 35 | /* |
34 | * Indicate to the HV that we are idle. Now would be | 36 | * Indicate to the HV that we are idle. Now would be |
@@ -45,6 +47,10 @@ static inline void idle_loop_epilog(unsigned long in_purr) | |||
45 | wait_cycles += mfspr(SPRN_PURR) - in_purr; | 47 | wait_cycles += mfspr(SPRN_PURR) - in_purr; |
46 | get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles); | 48 | get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles); |
47 | get_lppaca()->idle = 0; | 49 | get_lppaca()->idle = 0; |
50 | |||
51 | if (irqs_disabled()) | ||
52 | local_irq_enable(); | ||
53 | ppc64_runlatch_on(); | ||
48 | } | 54 | } |
49 | 55 | ||
50 | static int snooze_loop(struct cpuidle_device *dev, | 56 | static int snooze_loop(struct cpuidle_device *dev, |
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec1cd0b..c49a2585ff7d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3,6 +3,8 @@ | |||
3 | 3 | ||
4 | #include <uapi/linux/sched.h> | 4 | #include <uapi/linux/sched.h> |
5 | 5 | ||
6 | #include <linux/sched/prio.h> | ||
7 | |||
6 | 8 | ||
7 | struct sched_param { | 9 | struct sched_param { |
8 | int sched_priority; | 10 | int sched_priority; |
@@ -1077,6 +1079,7 @@ struct sched_entity { | |||
1077 | #endif | 1079 | #endif |
1078 | 1080 | ||
1079 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1081 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1082 | int depth; | ||
1080 | struct sched_entity *parent; | 1083 | struct sched_entity *parent; |
1081 | /* rq on which this entity is (to be) queued: */ | 1084 | /* rq on which this entity is (to be) queued: */ |
1082 | struct cfs_rq *cfs_rq; | 1085 | struct cfs_rq *cfs_rq; |
@@ -1470,9 +1473,10 @@ struct task_struct { | |||
1470 | unsigned int numa_scan_period; | 1473 | unsigned int numa_scan_period; |
1471 | unsigned int numa_scan_period_max; | 1474 | unsigned int numa_scan_period_max; |
1472 | int numa_preferred_nid; | 1475 | int numa_preferred_nid; |
1473 | int numa_migrate_deferred; | ||
1474 | unsigned long numa_migrate_retry; | 1476 | unsigned long numa_migrate_retry; |
1475 | u64 node_stamp; /* migration stamp */ | 1477 | u64 node_stamp; /* migration stamp */ |
1478 | u64 last_task_numa_placement; | ||
1479 | u64 last_sum_exec_runtime; | ||
1476 | struct callback_head numa_work; | 1480 | struct callback_head numa_work; |
1477 | 1481 | ||
1478 | struct list_head numa_entry; | 1482 | struct list_head numa_entry; |
@@ -1483,15 +1487,22 @@ struct task_struct { | |||
1483 | * Scheduling placement decisions are made based on the these counts. | 1487 | * Scheduling placement decisions are made based on the these counts. |
1484 | * The values remain static for the duration of a PTE scan | 1488 | * The values remain static for the duration of a PTE scan |
1485 | */ | 1489 | */ |
1486 | unsigned long *numa_faults; | 1490 | unsigned long *numa_faults_memory; |
1487 | unsigned long total_numa_faults; | 1491 | unsigned long total_numa_faults; |
1488 | 1492 | ||
1489 | /* | 1493 | /* |
1490 | * numa_faults_buffer records faults per node during the current | 1494 | * numa_faults_buffer records faults per node during the current |
1491 | * scan window. When the scan completes, the counts in numa_faults | 1495 | * scan window. When the scan completes, the counts in |
1492 | * decay and these values are copied. | 1496 | * numa_faults_memory decay and these values are copied. |
1497 | */ | ||
1498 | unsigned long *numa_faults_buffer_memory; | ||
1499 | |||
1500 | /* | ||
1501 | * Track the nodes the process was running on when a NUMA hinting | ||
1502 | * fault was incurred. | ||
1493 | */ | 1503 | */ |
1494 | unsigned long *numa_faults_buffer; | 1504 | unsigned long *numa_faults_cpu; |
1505 | unsigned long *numa_faults_buffer_cpu; | ||
1495 | 1506 | ||
1496 | /* | 1507 | /* |
1497 | * numa_faults_locality tracks if faults recorded during the last | 1508 | * numa_faults_locality tracks if faults recorded during the last |
@@ -1596,8 +1607,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); | |||
1596 | extern pid_t task_numa_group_id(struct task_struct *p); | 1607 | extern pid_t task_numa_group_id(struct task_struct *p); |
1597 | extern void set_numabalancing_state(bool enabled); | 1608 | extern void set_numabalancing_state(bool enabled); |
1598 | extern void task_numa_free(struct task_struct *p); | 1609 | extern void task_numa_free(struct task_struct *p); |
1599 | 1610 | extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, | |
1600 | extern unsigned int sysctl_numa_balancing_migrate_deferred; | 1611 | int src_nid, int dst_cpu); |
1601 | #else | 1612 | #else |
1602 | static inline void task_numa_fault(int last_node, int node, int pages, | 1613 | static inline void task_numa_fault(int last_node, int node, int pages, |
1603 | int flags) | 1614 | int flags) |
@@ -1613,6 +1624,11 @@ static inline void set_numabalancing_state(bool enabled) | |||
1613 | static inline void task_numa_free(struct task_struct *p) | 1624 | static inline void task_numa_free(struct task_struct *p) |
1614 | { | 1625 | { |
1615 | } | 1626 | } |
1627 | static inline bool should_numa_migrate_memory(struct task_struct *p, | ||
1628 | struct page *page, int src_nid, int dst_cpu) | ||
1629 | { | ||
1630 | return true; | ||
1631 | } | ||
1616 | #endif | 1632 | #endif |
1617 | 1633 | ||
1618 | static inline struct pid *task_pid(struct task_struct *task) | 1634 | static inline struct pid *task_pid(struct task_struct *task) |
@@ -2080,7 +2096,16 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { } | |||
2080 | extern bool yield_to(struct task_struct *p, bool preempt); | 2096 | extern bool yield_to(struct task_struct *p, bool preempt); |
2081 | extern void set_user_nice(struct task_struct *p, long nice); | 2097 | extern void set_user_nice(struct task_struct *p, long nice); |
2082 | extern int task_prio(const struct task_struct *p); | 2098 | extern int task_prio(const struct task_struct *p); |
2083 | extern int task_nice(const struct task_struct *p); | 2099 | /** |
2100 | * task_nice - return the nice value of a given task. | ||
2101 | * @p: the task in question. | ||
2102 | * | ||
2103 | * Return: The nice value [ -20 ... 0 ... 19 ]. | ||
2104 | */ | ||
2105 | static inline int task_nice(const struct task_struct *p) | ||
2106 | { | ||
2107 | return PRIO_TO_NICE((p)->static_prio); | ||
2108 | } | ||
2084 | extern int can_nice(const struct task_struct *p, const int nice); | 2109 | extern int can_nice(const struct task_struct *p, const int nice); |
2085 | extern int task_curr(const struct task_struct *p); | 2110 | extern int task_curr(const struct task_struct *p); |
2086 | extern int idle_cpu(int cpu); | 2111 | extern int idle_cpu(int cpu); |
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
new file mode 100644
index 000000000000..410ccb74c9e6
--- /dev/null
+++ b/include/linux/sched/prio.h
@@ -0,0 +1,40 @@ | |||
1 | #ifndef _SCHED_PRIO_H | ||
2 | #define _SCHED_PRIO_H | ||
3 | |||
4 | /* | ||
5 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT | ||
6 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH | ||
7 | * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority | ||
8 | * values are inverted: lower p->prio value means higher priority. | ||
9 | * | ||
10 | * The MAX_USER_RT_PRIO value allows the actual maximum | ||
11 | * RT priority to be separate from the value exported to | ||
12 | * user-space. This allows kernel threads to set their | ||
13 | * priority to a value higher than any user task. Note: | ||
14 | * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. | ||
15 | */ | ||
16 | |||
17 | #define MAX_USER_RT_PRIO 100 | ||
18 | #define MAX_RT_PRIO MAX_USER_RT_PRIO | ||
19 | |||
20 | #define MAX_PRIO (MAX_RT_PRIO + 40) | ||
21 | #define DEFAULT_PRIO (MAX_RT_PRIO + 20) | ||
22 | |||
23 | /* | ||
24 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
25 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
26 | * and back. | ||
27 | */ | ||
28 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
29 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
30 | |||
31 | /* | ||
32 | * 'User priority' is the nice value converted to something we | ||
33 | * can work with better when scaling various scheduler parameters, | ||
34 | * it's a [ 0 ... 39 ] range. | ||
35 | */ | ||
36 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
37 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
38 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
39 | |||
40 | #endif /* _SCHED_PRIO_H */ | ||
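[Aside, not part of the patch: a minimal user-space sketch of the nice <-> static priority arithmetic these macros encode, assuming the values defined above (MAX_RT_PRIO == 100). The new task_nice() helper added to sched.h is just PRIO_TO_NICE() applied to p->static_prio.]

    #include <assert.h>
    #include <stdio.h>

    #define MAX_RT_PRIO          100
    #define MAX_PRIO             (MAX_RT_PRIO + 40)
    #define NICE_TO_PRIO(nice)   (MAX_RT_PRIO + (nice) + 20)
    #define PRIO_TO_NICE(prio)   ((prio) - MAX_RT_PRIO - 20)

    int main(void)
    {
        /* nice -20..19 maps onto static priority 100..139 */
        assert(NICE_TO_PRIO(-20) == 100);
        assert(NICE_TO_PRIO(0)   == 120);   /* DEFAULT_PRIO           */
        assert(NICE_TO_PRIO(19)  == 139);   /* MAX_PRIO - 1           */
        assert(PRIO_TO_NICE(NICE_TO_PRIO(5)) == 5); /* round trip     */
        printf("nice 0 -> prio %d\n", NICE_TO_PRIO(0));
        return 0;
    }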
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 34e4ebea8fce..f7453d4c5613 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -1,24 +1,7 @@ | |||
1 | #ifndef _SCHED_RT_H | 1 | #ifndef _SCHED_RT_H |
2 | #define _SCHED_RT_H | 2 | #define _SCHED_RT_H |
3 | 3 | ||
4 | /* | 4 | #include <linux/sched/prio.h> |
5 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT | ||
6 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH | ||
7 | * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority | ||
8 | * values are inverted: lower p->prio value means higher priority. | ||
9 | * | ||
10 | * The MAX_USER_RT_PRIO value allows the actual maximum | ||
11 | * RT priority to be separate from the value exported to | ||
12 | * user-space. This allows kernel threads to set their | ||
13 | * priority to a value higher than any user task. Note: | ||
14 | * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. | ||
15 | */ | ||
16 | |||
17 | #define MAX_USER_RT_PRIO 100 | ||
18 | #define MAX_RT_PRIO MAX_USER_RT_PRIO | ||
19 | |||
20 | #define MAX_PRIO (MAX_RT_PRIO + 40) | ||
21 | #define DEFAULT_PRIO (MAX_RT_PRIO + 20) | ||
22 | 5 | ||
23 | static inline int rt_prio(int prio) | 6 | static inline int rt_prio(int prio) |
24 | { | 7 | { |
diff --git a/kernel/Makefile b/kernel/Makefile
index bc010ee272b6..6f1c7e5cfca1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-y += sched/ | |||
22 | obj-y += locking/ | 22 | obj-y += locking/ |
23 | obj-y += power/ | 23 | obj-y += power/ |
24 | obj-y += printk/ | 24 | obj-y += printk/ |
25 | obj-y += cpu/ | ||
26 | obj-y += irq/ | 25 | obj-y += irq/ |
27 | obj-y += rcu/ | 26 | obj-y += rcu/ |
28 | 27 | ||
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@ | |||
1 | obj-y = idle.o | ||
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 277f494c2a9a..b7976a127178 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -3,6 +3,7 @@ | |||
3 | */ | 3 | */ |
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | #include <linux/cpu.h> | 5 | #include <linux/cpu.h> |
6 | #include <linux/cpuidle.h> | ||
6 | #include <linux/tick.h> | 7 | #include <linux/tick.h> |
7 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
8 | #include <linux/stackprotector.h> | 9 | #include <linux/stackprotector.h> |
@@ -95,8 +96,10 @@ static void cpu_idle_loop(void) | |||
95 | if (!current_clr_polling_and_test()) { | 96 | if (!current_clr_polling_and_test()) { |
96 | stop_critical_timings(); | 97 | stop_critical_timings(); |
97 | rcu_idle_enter(); | 98 | rcu_idle_enter(); |
98 | arch_cpu_idle(); | 99 | if (cpuidle_idle_call()) |
99 | WARN_ON_ONCE(irqs_disabled()); | 100 | arch_cpu_idle(); |
101 | if (WARN_ON_ONCE(irqs_disabled())) | ||
102 | local_irq_enable(); | ||
100 | rcu_idle_exit(); | 103 | rcu_idle_exit(); |
101 | start_critical_timings(); | 104 | start_critical_timings(); |
102 | } else { | 105 | } else { |
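[Aside, not part of the patch: the fallback logic this hunk moves into the generic idle loop can be sketched as a stand-alone program. The cpuidle_idle_call()/arch_cpu_idle() below are stubs that only mimic the calling convention, not the kernel implementations.]

    #include <stdbool.h>
    #include <stdio.h>

    static int  cpuidle_idle_call(void) { return -1; /* pretend: no cpuidle driver */ }
    static void arch_cpu_idle(void)     { puts("arch default idle"); }
    static bool irqs_disabled(void)     { return false; }
    static void local_irq_enable(void)  { }

    /* Simplified model of one pass through the new generic idle logic. */
    static void idle_step(void)
    {
        if (cpuidle_idle_call())    /* non-zero: cpuidle unavailable or failed */
            arch_cpu_idle();        /* fall back to the architecture default   */
        if (irqs_disabled())        /* WARN_ON_ONCE() in the real code         */
            local_irq_enable();
    }

    int main(void) { idle_step(); return 0; }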
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif | |||
13 | 13 | ||
14 | obj-y += core.o proc.o clock.o cputime.o | 14 | obj-y += core.o proc.o clock.o cputime.o |
15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | 15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o |
16 | obj-y += wait.o completion.o | 16 | obj-y += wait.o completion.o idle.o |
17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o | 17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o |
18 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 18 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
19 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 19 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131ef6aab..fb9764fbc537 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1745 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | 1745 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; |
1746 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | 1746 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; |
1747 | p->numa_work.next = &p->numa_work; | 1747 | p->numa_work.next = &p->numa_work; |
1748 | p->numa_faults = NULL; | 1748 | p->numa_faults_memory = NULL; |
1749 | p->numa_faults_buffer = NULL; | 1749 | p->numa_faults_buffer_memory = NULL; |
1750 | p->last_task_numa_placement = 0; | ||
1751 | p->last_sum_exec_runtime = 0; | ||
1750 | 1752 | ||
1751 | INIT_LIST_HEAD(&p->numa_entry); | 1753 | INIT_LIST_HEAD(&p->numa_entry); |
1752 | p->numa_group = NULL; | 1754 | p->numa_group = NULL; |
@@ -2167,13 +2169,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2167 | 2169 | ||
2168 | #ifdef CONFIG_SMP | 2170 | #ifdef CONFIG_SMP |
2169 | 2171 | ||
2170 | /* assumes rq->lock is held */ | ||
2171 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | ||
2172 | { | ||
2173 | if (prev->sched_class->pre_schedule) | ||
2174 | prev->sched_class->pre_schedule(rq, prev); | ||
2175 | } | ||
2176 | |||
2177 | /* rq->lock is NOT held, but preemption is disabled */ | 2172 | /* rq->lock is NOT held, but preemption is disabled */ |
2178 | static inline void post_schedule(struct rq *rq) | 2173 | static inline void post_schedule(struct rq *rq) |
2179 | { | 2174 | { |
@@ -2191,10 +2186,6 @@ static inline void post_schedule(struct rq *rq) | |||
2191 | 2186 | ||
2192 | #else | 2187 | #else |
2193 | 2188 | ||
2194 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | ||
2195 | { | ||
2196 | } | ||
2197 | |||
2198 | static inline void post_schedule(struct rq *rq) | 2189 | static inline void post_schedule(struct rq *rq) |
2199 | { | 2190 | { |
2200 | } | 2191 | } |
@@ -2577,18 +2568,11 @@ static inline void schedule_debug(struct task_struct *prev) | |||
2577 | schedstat_inc(this_rq(), sched_count); | 2568 | schedstat_inc(this_rq(), sched_count); |
2578 | } | 2569 | } |
2579 | 2570 | ||
2580 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | ||
2581 | { | ||
2582 | if (prev->on_rq || rq->skip_clock_update < 0) | ||
2583 | update_rq_clock(rq); | ||
2584 | prev->sched_class->put_prev_task(rq, prev); | ||
2585 | } | ||
2586 | |||
2587 | /* | 2571 | /* |
2588 | * Pick up the highest-prio task: | 2572 | * Pick up the highest-prio task: |
2589 | */ | 2573 | */ |
2590 | static inline struct task_struct * | 2574 | static inline struct task_struct * |
2591 | pick_next_task(struct rq *rq) | 2575 | pick_next_task(struct rq *rq, struct task_struct *prev) |
2592 | { | 2576 | { |
2593 | const struct sched_class *class; | 2577 | const struct sched_class *class; |
2594 | struct task_struct *p; | 2578 | struct task_struct *p; |
@@ -2597,14 +2581,15 @@ pick_next_task(struct rq *rq) | |||
2597 | * Optimization: we know that if all tasks are in | 2581 | * Optimization: we know that if all tasks are in |
2598 | * the fair class we can call that function directly: | 2582 | * the fair class we can call that function directly: |
2599 | */ | 2583 | */ |
2600 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { | 2584 | if (likely(prev->sched_class == &fair_sched_class && |
2601 | p = fair_sched_class.pick_next_task(rq); | 2585 | rq->nr_running == rq->cfs.h_nr_running)) { |
2586 | p = fair_sched_class.pick_next_task(rq, prev); | ||
2602 | if (likely(p)) | 2587 | if (likely(p)) |
2603 | return p; | 2588 | return p; |
2604 | } | 2589 | } |
2605 | 2590 | ||
2606 | for_each_class(class) { | 2591 | for_each_class(class) { |
2607 | p = class->pick_next_task(rq); | 2592 | p = class->pick_next_task(rq, prev); |
2608 | if (p) | 2593 | if (p) |
2609 | return p; | 2594 | return p; |
2610 | } | 2595 | } |
@@ -2700,13 +2685,10 @@ need_resched: | |||
2700 | switch_count = &prev->nvcsw; | 2685 | switch_count = &prev->nvcsw; |
2701 | } | 2686 | } |
2702 | 2687 | ||
2703 | pre_schedule(rq, prev); | 2688 | if (prev->on_rq || rq->skip_clock_update < 0) |
2704 | 2689 | update_rq_clock(rq); | |
2705 | if (unlikely(!rq->nr_running)) | ||
2706 | idle_balance(cpu, rq); | ||
2707 | 2690 | ||
2708 | put_prev_task(rq, prev); | 2691 | next = pick_next_task(rq, prev); |
2709 | next = pick_next_task(rq); | ||
2710 | clear_tsk_need_resched(prev); | 2692 | clear_tsk_need_resched(prev); |
2711 | clear_preempt_need_resched(); | 2693 | clear_preempt_need_resched(); |
2712 | rq->skip_clock_update = 0; | 2694 | rq->skip_clock_update = 0; |
@@ -2998,7 +2980,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
2998 | unsigned long flags; | 2980 | unsigned long flags; |
2999 | struct rq *rq; | 2981 | struct rq *rq; |
3000 | 2982 | ||
3001 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 2983 | if (task_nice(p) == nice || nice < -20 || nice > 19) |
3002 | return; | 2984 | return; |
3003 | /* | 2985 | /* |
3004 | * We have to be careful, if called from sys_setpriority(), | 2986 | * We have to be careful, if called from sys_setpriority(), |
@@ -3076,7 +3058,7 @@ SYSCALL_DEFINE1(nice, int, increment) | |||
3076 | if (increment > 40) | 3058 | if (increment > 40) |
3077 | increment = 40; | 3059 | increment = 40; |
3078 | 3060 | ||
3079 | nice = TASK_NICE(current) + increment; | 3061 | nice = task_nice(current) + increment; |
3080 | if (nice < -20) | 3062 | if (nice < -20) |
3081 | nice = -20; | 3063 | nice = -20; |
3082 | if (nice > 19) | 3064 | if (nice > 19) |
@@ -3109,18 +3091,6 @@ int task_prio(const struct task_struct *p) | |||
3109 | } | 3091 | } |
3110 | 3092 | ||
3111 | /** | 3093 | /** |
3112 | * task_nice - return the nice value of a given task. | ||
3113 | * @p: the task in question. | ||
3114 | * | ||
3115 | * Return: The nice value [ -20 ... 0 ... 19 ]. | ||
3116 | */ | ||
3117 | int task_nice(const struct task_struct *p) | ||
3118 | { | ||
3119 | return TASK_NICE(p); | ||
3120 | } | ||
3121 | EXPORT_SYMBOL(task_nice); | ||
3122 | |||
3123 | /** | ||
3124 | * idle_cpu - is a given cpu idle currently? | 3094 | * idle_cpu - is a given cpu idle currently? |
3125 | * @cpu: the processor in question. | 3095 | * @cpu: the processor in question. |
3126 | * | 3096 | * |
@@ -3319,7 +3289,7 @@ recheck: | |||
3319 | */ | 3289 | */ |
3320 | if (user && !capable(CAP_SYS_NICE)) { | 3290 | if (user && !capable(CAP_SYS_NICE)) { |
3321 | if (fair_policy(policy)) { | 3291 | if (fair_policy(policy)) { |
3322 | if (attr->sched_nice < TASK_NICE(p) && | 3292 | if (attr->sched_nice < task_nice(p) && |
3323 | !can_nice(p, attr->sched_nice)) | 3293 | !can_nice(p, attr->sched_nice)) |
3324 | return -EPERM; | 3294 | return -EPERM; |
3325 | } | 3295 | } |
@@ -3343,7 +3313,7 @@ recheck: | |||
3343 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. | 3313 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
3344 | */ | 3314 | */ |
3345 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { | 3315 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { |
3346 | if (!can_nice(p, TASK_NICE(p))) | 3316 | if (!can_nice(p, task_nice(p))) |
3347 | return -EPERM; | 3317 | return -EPERM; |
3348 | } | 3318 | } |
3349 | 3319 | ||
@@ -3383,7 +3353,7 @@ recheck: | |||
3383 | * If not changing anything there's no need to proceed further: | 3353 | * If not changing anything there's no need to proceed further: |
3384 | */ | 3354 | */ |
3385 | if (unlikely(policy == p->policy)) { | 3355 | if (unlikely(policy == p->policy)) { |
3386 | if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) | 3356 | if (fair_policy(policy) && attr->sched_nice != task_nice(p)) |
3387 | goto change; | 3357 | goto change; |
3388 | if (rt_policy(policy) && attr->sched_priority != p->rt_priority) | 3358 | if (rt_policy(policy) && attr->sched_priority != p->rt_priority) |
3389 | goto change; | 3359 | goto change; |
@@ -3835,7 +3805,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | |||
3835 | else if (task_has_rt_policy(p)) | 3805 | else if (task_has_rt_policy(p)) |
3836 | attr.sched_priority = p->rt_priority; | 3806 | attr.sched_priority = p->rt_priority; |
3837 | else | 3807 | else |
3838 | attr.sched_nice = TASK_NICE(p); | 3808 | attr.sched_nice = task_nice(p); |
3839 | 3809 | ||
3840 | rcu_read_unlock(); | 3810 | rcu_read_unlock(); |
3841 | 3811 | ||
@@ -4751,7 +4721,7 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
4751 | if (rq->nr_running == 1) | 4721 | if (rq->nr_running == 1) |
4752 | break; | 4722 | break; |
4753 | 4723 | ||
4754 | next = pick_next_task(rq); | 4724 | next = pick_next_task(rq, NULL); |
4755 | BUG_ON(!next); | 4725 | BUG_ON(!next); |
4756 | next->sched_class->put_prev_task(rq, next); | 4726 | next->sched_class->put_prev_task(rq, next); |
4757 | 4727 | ||
@@ -4841,7 +4811,7 @@ set_table_entry(struct ctl_table *entry, | |||
4841 | static struct ctl_table * | 4811 | static struct ctl_table * |
4842 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 4812 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
4843 | { | 4813 | { |
4844 | struct ctl_table *table = sd_alloc_ctl_entry(13); | 4814 | struct ctl_table *table = sd_alloc_ctl_entry(14); |
4845 | 4815 | ||
4846 | if (table == NULL) | 4816 | if (table == NULL) |
4847 | return NULL; | 4817 | return NULL; |
@@ -4869,9 +4839,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
4869 | sizeof(int), 0644, proc_dointvec_minmax, false); | 4839 | sizeof(int), 0644, proc_dointvec_minmax, false); |
4870 | set_table_entry(&table[10], "flags", &sd->flags, | 4840 | set_table_entry(&table[10], "flags", &sd->flags, |
4871 | sizeof(int), 0644, proc_dointvec_minmax, false); | 4841 | sizeof(int), 0644, proc_dointvec_minmax, false); |
4872 | set_table_entry(&table[11], "name", sd->name, | 4842 | set_table_entry(&table[11], "max_newidle_lb_cost", |
4843 | &sd->max_newidle_lb_cost, | ||
4844 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
4845 | set_table_entry(&table[12], "name", sd->name, | ||
4873 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); | 4846 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); |
4874 | /* &table[12] is terminator */ | 4847 | /* &table[13] is terminator */ |
4875 | 4848 | ||
4876 | return table; | 4849 | return table; |
4877 | } | 4850 | } |
@@ -7008,7 +6981,7 @@ void normalize_rt_tasks(void) | |||
7008 | * Renice negative nice level userspace | 6981 | * Renice negative nice level userspace |
7009 | * tasks back to 0: | 6982 | * tasks back to 0: |
7010 | */ | 6983 | */ |
7011 | if (TASK_NICE(p) < 0 && p->mm) | 6984 | if (task_nice(p) < 0 && p->mm) |
7012 | set_user_nice(p, 0); | 6985 | set_user_nice(p, 0); |
7013 | continue; | 6986 | continue; |
7014 | } | 6987 | } |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..58624a65f124 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, | |||
142 | p->utimescaled += cputime_scaled; | 142 | p->utimescaled += cputime_scaled; |
143 | account_group_user_time(p, cputime); | 143 | account_group_user_time(p, cputime); |
144 | 144 | ||
145 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | 145 | index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; |
146 | 146 | ||
147 | /* Add user time to cpustat. */ | 147 | /* Add user time to cpustat. */ |
148 | task_group_account_field(p, index, (__force u64) cputime); | 148 | task_group_account_field(p, index, (__force u64) cputime); |
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
169 | p->gtime += cputime; | 169 | p->gtime += cputime; |
170 | 170 | ||
171 | /* Add guest time to cpustat. */ | 171 | /* Add guest time to cpustat. */ |
172 | if (TASK_NICE(p) > 0) { | 172 | if (task_nice(p) > 0) { |
173 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | 173 | cpustat[CPUTIME_NICE] += (__force u64) cputime; |
174 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | 174 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; |
175 | } else { | 175 | } else { |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dd5e0971a07..ed31ef66ab9d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -944,6 +944,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) | |||
944 | resched_task(rq->curr); | 944 | resched_task(rq->curr); |
945 | } | 945 | } |
946 | 946 | ||
947 | static int pull_dl_task(struct rq *this_rq); | ||
948 | |||
947 | #endif /* CONFIG_SMP */ | 949 | #endif /* CONFIG_SMP */ |
948 | 950 | ||
949 | /* | 951 | /* |
@@ -990,7 +992,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, | |||
990 | return rb_entry(left, struct sched_dl_entity, rb_node); | 992 | return rb_entry(left, struct sched_dl_entity, rb_node); |
991 | } | 993 | } |
992 | 994 | ||
993 | struct task_struct *pick_next_task_dl(struct rq *rq) | 995 | struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) |
994 | { | 996 | { |
995 | struct sched_dl_entity *dl_se; | 997 | struct sched_dl_entity *dl_se; |
996 | struct task_struct *p; | 998 | struct task_struct *p; |
@@ -998,9 +1000,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq) | |||
998 | 1000 | ||
999 | dl_rq = &rq->dl; | 1001 | dl_rq = &rq->dl; |
1000 | 1002 | ||
1003 | #ifdef CONFIG_SMP | ||
1004 | if (dl_task(prev)) | ||
1005 | pull_dl_task(rq); | ||
1006 | #endif | ||
1007 | |||
1001 | if (unlikely(!dl_rq->dl_nr_running)) | 1008 | if (unlikely(!dl_rq->dl_nr_running)) |
1002 | return NULL; | 1009 | return NULL; |
1003 | 1010 | ||
1011 | if (prev) | ||
1012 | prev->sched_class->put_prev_task(rq, prev); | ||
1013 | |||
1004 | dl_se = pick_next_dl_entity(rq, dl_rq); | 1014 | dl_se = pick_next_dl_entity(rq, dl_rq); |
1005 | BUG_ON(!dl_se); | 1015 | BUG_ON(!dl_se); |
1006 | 1016 | ||
@@ -1426,13 +1436,6 @@ skip: | |||
1426 | return ret; | 1436 | return ret; |
1427 | } | 1437 | } |
1428 | 1438 | ||
1429 | static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) | ||
1430 | { | ||
1431 | /* Try to pull other tasks here */ | ||
1432 | if (dl_task(prev)) | ||
1433 | pull_dl_task(rq); | ||
1434 | } | ||
1435 | |||
1436 | static void post_schedule_dl(struct rq *rq) | 1439 | static void post_schedule_dl(struct rq *rq) |
1437 | { | 1440 | { |
1438 | push_dl_tasks(rq); | 1441 | push_dl_tasks(rq); |
@@ -1560,7 +1563,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1560 | if (unlikely(p->dl.dl_throttled)) | 1563 | if (unlikely(p->dl.dl_throttled)) |
1561 | return; | 1564 | return; |
1562 | 1565 | ||
1563 | if (p->on_rq || rq->curr != p) { | 1566 | if (p->on_rq && rq->curr != p) { |
1564 | #ifdef CONFIG_SMP | 1567 | #ifdef CONFIG_SMP |
1565 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | 1568 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) |
1566 | /* Only reschedule if pushing failed */ | 1569 | /* Only reschedule if pushing failed */ |
@@ -1625,7 +1628,6 @@ const struct sched_class dl_sched_class = { | |||
1625 | .set_cpus_allowed = set_cpus_allowed_dl, | 1628 | .set_cpus_allowed = set_cpus_allowed_dl, |
1626 | .rq_online = rq_online_dl, | 1629 | .rq_online = rq_online_dl, |
1627 | .rq_offline = rq_offline_dl, | 1630 | .rq_offline = rq_offline_dl, |
1628 | .pre_schedule = pre_schedule_dl, | ||
1629 | .post_schedule = post_schedule_dl, | 1631 | .post_schedule = post_schedule_dl, |
1630 | .task_woken = task_woken_dl, | 1632 | .task_woken = task_woken_dl, |
1631 | #endif | 1633 | #endif |
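[Aside, not part of the patch: the new pick_next_task(rq, prev) convention visible in the pick_next_task_dl() hunk above — the class pulls work itself (replacing ->pre_schedule()) and retires prev via put_prev_task() before picking — sketched with hypothetical stand-in types; none of the names below are kernel APIs.]

    #include <stdio.h>

    /* Hypothetical stand-ins, only to show the calling convention. */
    struct rq   { int nr_runnable; };
    struct task { const char *name; };

    static void put_prev_task(struct rq *rq, struct task *prev)
    {
        (void)rq;
        printf("retiring %s\n", prev->name);
    }

    static struct task *pick_next_task_foo(struct rq *rq, struct task *prev)
    {
        static struct task next = { "next" };

        /* 1. pull work from other CPUs first (what ->pre_schedule() used to do) */
        /* 2. nothing runnable? return NULL so a lower class gets a chance       */
        if (!rq->nr_runnable)
            return NULL;
        /* 3. only now retire prev; the core no longer calls put_prev_task()     */
        if (prev)
            put_prev_task(rq, prev);
        return &next;
    }

    int main(void)
    {
        struct rq rq = { 1 };
        struct task prev = { "prev" };

        printf("picked %s\n", pick_next_task_foo(&rq, &prev)->name);
        return 0;
    }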
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..f3344c31632a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,7 @@ do { \ | |||
321 | P(sched_goidle); | 321 | P(sched_goidle); |
322 | #ifdef CONFIG_SMP | 322 | #ifdef CONFIG_SMP |
323 | P64(avg_idle); | 323 | P64(avg_idle); |
324 | P64(max_idle_balance_cost); | ||
324 | #endif | 325 | #endif |
325 | 326 | ||
326 | P(ttwu_count); | 327 | P(ttwu_count); |
@@ -533,15 +534,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) | |||
533 | unsigned long nr_faults = -1; | 534 | unsigned long nr_faults = -1; |
534 | int cpu_current, home_node; | 535 | int cpu_current, home_node; |
535 | 536 | ||
536 | if (p->numa_faults) | 537 | if (p->numa_faults_memory) |
537 | nr_faults = p->numa_faults[2*node + i]; | 538 | nr_faults = p->numa_faults_memory[2*node + i]; |
538 | 539 | ||
539 | cpu_current = !i ? (task_node(p) == node) : | 540 | cpu_current = !i ? (task_node(p) == node) : |
540 | (pol && node_isset(node, pol->v.nodes)); | 541 | (pol && node_isset(node, pol->v.nodes)); |
541 | 542 | ||
542 | home_node = (p->numa_preferred_nid == node); | 543 | home_node = (p->numa_preferred_nid == node); |
543 | 544 | ||
544 | SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", | 545 | SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", |
545 | i, node, cpu_current, home_node, nr_faults); | 546 | i, node, cpu_current, home_node, nr_faults); |
546 | } | 547 | } |
547 | } | 548 | } |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2bfcb77..235cfa7ad8fc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | |||
322 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 322 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
323 | 323 | ||
324 | /* Do the two (enqueued) entities belong to the same group ? */ | 324 | /* Do the two (enqueued) entities belong to the same group ? */ |
325 | static inline int | 325 | static inline struct cfs_rq * |
326 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | 326 | is_same_group(struct sched_entity *se, struct sched_entity *pse) |
327 | { | 327 | { |
328 | if (se->cfs_rq == pse->cfs_rq) | 328 | if (se->cfs_rq == pse->cfs_rq) |
329 | return 1; | 329 | return se->cfs_rq; |
330 | 330 | ||
331 | return 0; | 331 | return NULL; |
332 | } | 332 | } |
333 | 333 | ||
334 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | 334 | static inline struct sched_entity *parent_entity(struct sched_entity *se) |
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
336 | return se->parent; | 336 | return se->parent; |
337 | } | 337 | } |
338 | 338 | ||
339 | /* return depth at which a sched entity is present in the hierarchy */ | ||
340 | static inline int depth_se(struct sched_entity *se) | ||
341 | { | ||
342 | int depth = 0; | ||
343 | |||
344 | for_each_sched_entity(se) | ||
345 | depth++; | ||
346 | |||
347 | return depth; | ||
348 | } | ||
349 | |||
350 | static void | 339 | static void |
351 | find_matching_se(struct sched_entity **se, struct sched_entity **pse) | 340 | find_matching_se(struct sched_entity **se, struct sched_entity **pse) |
352 | { | 341 | { |
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) | |||
360 | */ | 349 | */ |
361 | 350 | ||
362 | /* First walk up until both entities are at same depth */ | 351 | /* First walk up until both entities are at same depth */ |
363 | se_depth = depth_se(*se); | 352 | se_depth = (*se)->depth; |
364 | pse_depth = depth_se(*pse); | 353 | pse_depth = (*pse)->depth; |
365 | 354 | ||
366 | while (se_depth > pse_depth) { | 355 | while (se_depth > pse_depth) { |
367 | se_depth--; | 356 | se_depth--; |
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | |||
426 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 415 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
427 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 416 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
428 | 417 | ||
429 | static inline int | ||
430 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
431 | { | ||
432 | return 1; | ||
433 | } | ||
434 | |||
435 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | 418 | static inline struct sched_entity *parent_entity(struct sched_entity *se) |
436 | { | 419 | { |
437 | return NULL; | 420 | return NULL; |
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256; | |||
819 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | 802 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ |
820 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | 803 | unsigned int sysctl_numa_balancing_scan_delay = 1000; |
821 | 804 | ||
822 | /* | ||
823 | * After skipping a page migration on a shared page, skip N more numa page | ||
824 | * migrations unconditionally. This reduces the number of NUMA migrations | ||
825 | * in shared memory workloads, and has the effect of pulling tasks towards | ||
826 | * where their memory lives, over pulling the memory towards the task. | ||
827 | */ | ||
828 | unsigned int sysctl_numa_balancing_migrate_deferred = 16; | ||
829 | |||
830 | static unsigned int task_nr_scan_windows(struct task_struct *p) | 805 | static unsigned int task_nr_scan_windows(struct task_struct *p) |
831 | { | 806 | { |
832 | unsigned long rss = 0; | 807 | unsigned long rss = 0; |
@@ -893,10 +868,26 @@ struct numa_group { | |||
893 | struct list_head task_list; | 868 | struct list_head task_list; |
894 | 869 | ||
895 | struct rcu_head rcu; | 870 | struct rcu_head rcu; |
871 | nodemask_t active_nodes; | ||
896 | unsigned long total_faults; | 872 | unsigned long total_faults; |
873 | /* | ||
874 | * Faults_cpu is used to decide whether memory should move | ||
875 | * towards the CPU. As a consequence, these stats are weighted | ||
876 | * more by CPU use than by memory faults. | ||
877 | */ | ||
878 | unsigned long *faults_cpu; | ||
897 | unsigned long faults[0]; | 879 | unsigned long faults[0]; |
898 | }; | 880 | }; |
899 | 881 | ||
882 | /* Shared or private faults. */ | ||
883 | #define NR_NUMA_HINT_FAULT_TYPES 2 | ||
884 | |||
885 | /* Memory and CPU locality */ | ||
886 | #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) | ||
887 | |||
888 | /* Averaged statistics, and temporary buffers. */ | ||
889 | #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) | ||
890 | |||
900 | pid_t task_numa_group_id(struct task_struct *p) | 891 | pid_t task_numa_group_id(struct task_struct *p) |
901 | { | 892 | { |
902 | return p->numa_group ? p->numa_group->gid : 0; | 893 | return p->numa_group ? p->numa_group->gid : 0; |
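[Aside, not part of the patch: the per-task and per-group fault statistics are flat arrays indexed per (node, shared/private) pair. A small stand-alone sketch of that indexing, reusing the indexing rule and constant defined in this hunk; the 4-node machine is an assumed example.]

    #include <stdio.h>

    #define NR_NUMA_HINT_FAULT_TYPES 2   /* shared or private */

    /* Same indexing rule as task_faults_idx() above. */
    static int task_faults_idx(int nid, int priv)
    {
        return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
    }

    int main(void)
    {
        int nr_nodes = 4;   /* example machine with 4 NUMA nodes */

        /* numa_faults_memory and numa_faults_cpu each hold
         * nr_nodes * NR_NUMA_HINT_FAULT_TYPES counters. */
        for (int nid = 0; nid < nr_nodes; nid++)
            for (int priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++)
                printf("node %d, priv=%d -> index %d\n",
                       nid, priv, task_faults_idx(nid, priv));
        return 0;
    }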
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p) | |||
904 | 895 | ||
905 | static inline int task_faults_idx(int nid, int priv) | 896 | static inline int task_faults_idx(int nid, int priv) |
906 | { | 897 | { |
907 | return 2 * nid + priv; | 898 | return NR_NUMA_HINT_FAULT_TYPES * nid + priv; |
908 | } | 899 | } |
909 | 900 | ||
910 | static inline unsigned long task_faults(struct task_struct *p, int nid) | 901 | static inline unsigned long task_faults(struct task_struct *p, int nid) |
911 | { | 902 | { |
912 | if (!p->numa_faults) | 903 | if (!p->numa_faults_memory) |
913 | return 0; | 904 | return 0; |
914 | 905 | ||
915 | return p->numa_faults[task_faults_idx(nid, 0)] + | 906 | return p->numa_faults_memory[task_faults_idx(nid, 0)] + |
916 | p->numa_faults[task_faults_idx(nid, 1)]; | 907 | p->numa_faults_memory[task_faults_idx(nid, 1)]; |
917 | } | 908 | } |
918 | 909 | ||
919 | static inline unsigned long group_faults(struct task_struct *p, int nid) | 910 | static inline unsigned long group_faults(struct task_struct *p, int nid) |
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) | |||
925 | p->numa_group->faults[task_faults_idx(nid, 1)]; | 916 | p->numa_group->faults[task_faults_idx(nid, 1)]; |
926 | } | 917 | } |
927 | 918 | ||
919 | static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) | ||
920 | { | ||
921 | return group->faults_cpu[task_faults_idx(nid, 0)] + | ||
922 | group->faults_cpu[task_faults_idx(nid, 1)]; | ||
923 | } | ||
924 | |||
928 | /* | 925 | /* |
929 | * These return the fraction of accesses done by a particular task, or | 926 | * These return the fraction of accesses done by a particular task, or |
930 | * task group, on a particular numa node. The group weight is given a | 927 | * task group, on a particular numa node. The group weight is given a |
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) | |||
935 | { | 932 | { |
936 | unsigned long total_faults; | 933 | unsigned long total_faults; |
937 | 934 | ||
938 | if (!p->numa_faults) | 935 | if (!p->numa_faults_memory) |
939 | return 0; | 936 | return 0; |
940 | 937 | ||
941 | total_faults = p->total_numa_faults; | 938 | total_faults = p->total_numa_faults; |
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid) | |||
954 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; | 951 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; |
955 | } | 952 | } |
956 | 953 | ||
954 | bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | ||
955 | int src_nid, int dst_cpu) | ||
956 | { | ||
957 | struct numa_group *ng = p->numa_group; | ||
958 | int dst_nid = cpu_to_node(dst_cpu); | ||
959 | int last_cpupid, this_cpupid; | ||
960 | |||
961 | this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); | ||
962 | |||
963 | /* | ||
964 | * Multi-stage node selection is used in conjunction with a periodic | ||
965 | * migration fault to build a temporal task<->page relation. By using | ||
966 | * a two-stage filter we remove short/unlikely relations. | ||
967 | * | ||
968 | * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate | ||
969 | * a task's usage of a particular page (n_p) per total usage of this | ||
970 | * page (n_t) (in a given time-span) to a probability. | ||
971 | * | ||
972 | * Our periodic faults will sample this probability and getting the | ||
973 | * same result twice in a row, given these samples are fully | ||
974 | * independent, is then given by P(n)^2, provided our sample period | ||
975 | * is sufficiently short compared to the usage pattern. | ||
976 | * | ||
977 | * This quadric squishes small probabilities, making it less likely we | ||
978 | * act on an unlikely task<->page relation. | ||
979 | */ | ||
980 | last_cpupid = page_cpupid_xchg_last(page, this_cpupid); | ||
981 | if (!cpupid_pid_unset(last_cpupid) && | ||
982 | cpupid_to_nid(last_cpupid) != dst_nid) | ||
983 | return false; | ||
984 | |||
985 | /* Always allow migrate on private faults */ | ||
986 | if (cpupid_match_pid(p, last_cpupid)) | ||
987 | return true; | ||
988 | |||
989 | /* A shared fault, but p->numa_group has not been set up yet. */ | ||
990 | if (!ng) | ||
991 | return true; | ||
992 | |||
993 | /* | ||
994 | * Do not migrate if the destination is not a node that | ||
995 | * is actively used by this numa group. | ||
996 | */ | ||
997 | if (!node_isset(dst_nid, ng->active_nodes)) | ||
998 | return false; | ||
999 | |||
1000 | /* | ||
1001 | * Source is a node that is not actively used by this | ||
1002 | * numa group, while the destination is. Migrate. | ||
1003 | */ | ||
1004 | if (!node_isset(src_nid, ng->active_nodes)) | ||
1005 | return true; | ||
1006 | |||
1007 | /* | ||
1008 | * Both source and destination are nodes in active | ||
1009 | * use by this numa group. Maximize memory bandwidth | ||
1010 | * by migrating from more heavily used groups, to less | ||
1011 | * heavily used ones, spreading the load around. | ||
1012 | * Use a 1/4 hysteresis to avoid spurious page movement. | ||
1013 | */ | ||
1014 | return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); | ||
1015 | } | ||
1016 | |||
957 | static unsigned long weighted_cpuload(const int cpu); | 1017 | static unsigned long weighted_cpuload(const int cpu); |
958 | static unsigned long source_load(int cpu, int type); | 1018 | static unsigned long source_load(int cpu, int type); |
959 | static unsigned long target_load(int cpu, int type); | 1019 | static unsigned long target_load(int cpu, int type); |
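[Aside, not part of the patch: the two-stage filter described in the comment above can be given rough numbers (illustrative only). If a task accounts for a fraction p of the accesses to a page, two consecutive hinting faults attribute the page to it with probability about p^2, which is what squashes weak task<->page relations.]

    #include <stdio.h>

    int main(void)
    {
        /* Illustrative fractions of a page's accesses done by one task,
         * assuming independent samples as in the comment above. */
        double usage[] = { 0.9, 0.5, 0.25, 0.1 };

        for (int i = 0; i < 4; i++)
            printf("task does %4.0f%% of accesses -> two-in-a-row chance ~%5.1f%%\n",
                   usage[i] * 100.0, usage[i] * usage[i] * 100.0);
        return 0;
    }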
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1267 | static void numa_migrate_preferred(struct task_struct *p) | 1327 | static void numa_migrate_preferred(struct task_struct *p) |
1268 | { | 1328 | { |
1269 | /* This task has no NUMA fault statistics yet */ | 1329 | /* This task has no NUMA fault statistics yet */ |
1270 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | 1330 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) |
1271 | return; | 1331 | return; |
1272 | 1332 | ||
1273 | /* Periodically retry migrating the task to the preferred node */ | 1333 | /* Periodically retry migrating the task to the preferred node */ |
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
1282 | } | 1342 | } |
1283 | 1343 | ||
1284 | /* | 1344 | /* |
1345 | * Find the nodes on which the workload is actively running. We do this by | ||
1346 | * tracking the nodes from which NUMA hinting faults are triggered. This can | ||
1347 | * be different from the set of nodes where the workload's memory is currently | ||
1348 | * located. | ||
1349 | * | ||
1350 | * The bitmask is used to make smarter decisions on when to do NUMA page | ||
1351 | * migrations, To prevent flip-flopping, and excessive page migrations, nodes | ||
1352 | * are added when they cause over 6/16 of the maximum number of faults, but | ||
1353 | * only removed when they drop below 3/16. | ||
1354 | */ | ||
1355 | static void update_numa_active_node_mask(struct numa_group *numa_group) | ||
1356 | { | ||
1357 | unsigned long faults, max_faults = 0; | ||
1358 | int nid; | ||
1359 | |||
1360 | for_each_online_node(nid) { | ||
1361 | faults = group_faults_cpu(numa_group, nid); | ||
1362 | if (faults > max_faults) | ||
1363 | max_faults = faults; | ||
1364 | } | ||
1365 | |||
1366 | for_each_online_node(nid) { | ||
1367 | faults = group_faults_cpu(numa_group, nid); | ||
1368 | if (!node_isset(nid, numa_group->active_nodes)) { | ||
1369 | if (faults > max_faults * 6 / 16) | ||
1370 | node_set(nid, numa_group->active_nodes); | ||
1371 | } else if (faults < max_faults * 3 / 16) | ||
1372 | node_clear(nid, numa_group->active_nodes); | ||
1373 | } | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1285 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS | 1377 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS |
1286 | * increments. The more local the fault statistics are, the higher the scan | 1378 | * increments. The more local the fault statistics are, the higher the scan |
1287 | * period will be for the next scan window. If local/remote ratio is below | 1379 | * period will be for the next scan window. If local/remote ratio is below |
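[Aside, not part of the patch: the 6/16 and 3/16 thresholds above form a simple hysteresis. With example figures (not from the patch), if the busiest node sees 1600 CPU faults, another node joins active_nodes once it exceeds 600 faults and is only dropped again below 300.]

    #include <stdbool.h>
    #include <stdio.h>

    /* Minimal sketch of the add/remove hysteresis, assuming max_faults has
     * already been computed as in update_numa_active_node_mask() above. */
    static bool update_active(bool active, unsigned long faults,
                              unsigned long max_faults)
    {
        if (!active)
            return faults > max_faults * 6 / 16;   /* added above 6/16   */
        return !(faults < max_faults * 3 / 16);    /* dropped below 3/16 */
    }

    int main(void)
    {
        unsigned long max_faults = 1600;

        printf("650 faults, inactive -> %d\n", update_active(false, 650, max_faults)); /* 1: added   */
        printf("400 faults, active   -> %d\n", update_active(true,  400, max_faults)); /* 1: kept    */
        printf("250 faults, active   -> %d\n", update_active(true,  250, max_faults)); /* 0: dropped */
        return 0;
    }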
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p, | |||
1355 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | 1447 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); |
1356 | } | 1448 | } |
1357 | 1449 | ||
1450 | /* | ||
1451 | * Get the fraction of time the task has been running since the last | ||
1452 | * NUMA placement cycle. The scheduler keeps similar statistics, but | ||
1453 | * decays those on a 32ms period, which is orders of magnitude off | ||
1454 | * from the dozens-of-seconds NUMA balancing period. Use the scheduler | ||
1455 | * stats only if the task is so new there are no NUMA statistics yet. | ||
1456 | */ | ||
1457 | static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | ||
1458 | { | ||
1459 | u64 runtime, delta, now; | ||
1460 | /* Use the start of this time slice to avoid calculations. */ | ||
1461 | now = p->se.exec_start; | ||
1462 | runtime = p->se.sum_exec_runtime; | ||
1463 | |||
1464 | if (p->last_task_numa_placement) { | ||
1465 | delta = runtime - p->last_sum_exec_runtime; | ||
1466 | *period = now - p->last_task_numa_placement; | ||
1467 | } else { | ||
1468 | delta = p->se.avg.runnable_avg_sum; | ||
1469 | *period = p->se.avg.runnable_avg_period; | ||
1470 | } | ||
1471 | |||
1472 | p->last_sum_exec_runtime = runtime; | ||
1473 | p->last_task_numa_placement = now; | ||
1474 | |||
1475 | return delta; | ||
1476 | } | ||
1477 | |||
1358 | static void task_numa_placement(struct task_struct *p) | 1478 | static void task_numa_placement(struct task_struct *p) |
1359 | { | 1479 | { |
1360 | int seq, nid, max_nid = -1, max_group_nid = -1; | 1480 | int seq, nid, max_nid = -1, max_group_nid = -1; |
1361 | unsigned long max_faults = 0, max_group_faults = 0; | 1481 | unsigned long max_faults = 0, max_group_faults = 0; |
1362 | unsigned long fault_types[2] = { 0, 0 }; | 1482 | unsigned long fault_types[2] = { 0, 0 }; |
1483 | unsigned long total_faults; | ||
1484 | u64 runtime, period; | ||
1363 | spinlock_t *group_lock = NULL; | 1485 | spinlock_t *group_lock = NULL; |
1364 | 1486 | ||
1365 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); | 1487 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); |
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p) | |||
1368 | p->numa_scan_seq = seq; | 1490 | p->numa_scan_seq = seq; |
1369 | p->numa_scan_period_max = task_scan_max(p); | 1491 | p->numa_scan_period_max = task_scan_max(p); |
1370 | 1492 | ||
1493 | total_faults = p->numa_faults_locality[0] + | ||
1494 | p->numa_faults_locality[1]; | ||
1495 | runtime = numa_get_avg_runtime(p, &period); | ||
1496 | |||
1371 | /* If the task is part of a group prevent parallel updates to group stats */ | 1497 | /* If the task is part of a group prevent parallel updates to group stats */ |
1372 | if (p->numa_group) { | 1498 | if (p->numa_group) { |
1373 | group_lock = &p->numa_group->lock; | 1499 | group_lock = &p->numa_group->lock; |
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p) | |||
1379 | unsigned long faults = 0, group_faults = 0; | 1505 | unsigned long faults = 0, group_faults = 0; |
1380 | int priv, i; | 1506 | int priv, i; |
1381 | 1507 | ||
1382 | for (priv = 0; priv < 2; priv++) { | 1508 | for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { |
1383 | long diff; | 1509 | long diff, f_diff, f_weight; |
1384 | 1510 | ||
1385 | i = task_faults_idx(nid, priv); | 1511 | i = task_faults_idx(nid, priv); |
1386 | diff = -p->numa_faults[i]; | ||
1387 | 1512 | ||
1388 | /* Decay existing window, copy faults since last scan */ | 1513 | /* Decay existing window, copy faults since last scan */ |
1389 | p->numa_faults[i] >>= 1; | 1514 | diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; |
1390 | p->numa_faults[i] += p->numa_faults_buffer[i]; | 1515 | fault_types[priv] += p->numa_faults_buffer_memory[i]; |
1391 | fault_types[priv] += p->numa_faults_buffer[i]; | 1516 | p->numa_faults_buffer_memory[i] = 0; |
1392 | p->numa_faults_buffer[i] = 0; | ||
1393 | 1517 | ||
1394 | faults += p->numa_faults[i]; | 1518 | /* |
1395 | diff += p->numa_faults[i]; | 1519 | * Normalize the faults_cpu counters, so all tasks in a group |
1520 | * count according to CPU use, instead of by the raw | ||
1521 | * number of faults. Tasks with little runtime have | ||
1522 | * little over-all impact on throughput, and thus their | ||
1523 | * faults are less important. | ||
1524 | */ | ||
1525 | f_weight = div64_u64(runtime << 16, period + 1); | ||
1526 | f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / | ||
1527 | (total_faults + 1); | ||
1528 | f_diff = f_weight - p->numa_faults_cpu[i] / 2; | ||
1529 | p->numa_faults_buffer_cpu[i] = 0; | ||
1530 | |||
1531 | p->numa_faults_memory[i] += diff; | ||
1532 | p->numa_faults_cpu[i] += f_diff; | ||
1533 | faults += p->numa_faults_memory[i]; | ||
1396 | p->total_numa_faults += diff; | 1534 | p->total_numa_faults += diff; |
1397 | if (p->numa_group) { | 1535 | if (p->numa_group) { |
1398 | /* safe because we can only change our own group */ | 1536 | /* safe because we can only change our own group */ |
1399 | p->numa_group->faults[i] += diff; | 1537 | p->numa_group->faults[i] += diff; |
1538 | p->numa_group->faults_cpu[i] += f_diff; | ||
1400 | p->numa_group->total_faults += diff; | 1539 | p->numa_group->total_faults += diff; |
1401 | group_faults += p->numa_group->faults[i]; | 1540 | group_faults += p->numa_group->faults[i]; |
1402 | } | 1541 | } |
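The loop above maintains two decayed counters per node and fault type: the memory faults halve each placement cycle and then absorb whatever was buffered since the last scan, while the CPU-side faults are additionally scaled by the task's share of wall-clock time over the window returned by numa_get_avg_runtime(), in fixed point scaled by 2^16, so mostly-idle tasks barely move their group's statistics. A rough userspace rendering of that arithmetic (every sample value below is invented for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t runtime = 50000000ULL;		/* 50ms on-CPU (assumed)      */
	uint64_t period  = 200000000ULL;	/* 200ms wall clock (assumed) */
	unsigned long total_faults = 1000;	/* hinting faults this window */
	unsigned long faults_mem = 800, buf_mem = 300;
	unsigned long faults_cpu = 600, buf_cpu = 400;

	/* memory faults: decay the old window by half, add the new samples */
	long diff = (long)buf_mem - (long)(faults_mem / 2);
	faults_mem += diff;

	/* cpu faults: weight the new samples by runtime share (<< 16 fixed point) */
	uint64_t f_weight = (runtime << 16) / (period + 1);
	f_weight = f_weight * buf_cpu / (total_faults + 1);
	long f_diff = (long)f_weight - (long)(faults_cpu / 2);
	faults_cpu += f_diff;

	printf("decayed memory faults %lu, weighted cpu faults %lu\n",
	       faults_mem, faults_cpu);
	return 0;
}

diff and f_diff here play the same role as the deltas that the kernel code also folds into the numa_group totals under the group lock.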
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p) | |||
1416 | update_task_scan_period(p, fault_types[0], fault_types[1]); | 1555 | update_task_scan_period(p, fault_types[0], fault_types[1]); |
1417 | 1556 | ||
1418 | if (p->numa_group) { | 1557 | if (p->numa_group) { |
1558 | update_numa_active_node_mask(p->numa_group); | ||
1419 | /* | 1559 | /* |
1420 | * If the preferred task and group nids are different, | 1560 | * If the preferred task and group nids are different, |
1421 | * iterate over the nodes again to find the best place. | 1561 | * iterate over the nodes again to find the best place. |
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1465 | 1605 | ||
1466 | if (unlikely(!p->numa_group)) { | 1606 | if (unlikely(!p->numa_group)) { |
1467 | unsigned int size = sizeof(struct numa_group) + | 1607 | unsigned int size = sizeof(struct numa_group) + |
1468 | 2*nr_node_ids*sizeof(unsigned long); | 1608 | 4*nr_node_ids*sizeof(unsigned long); |
1469 | 1609 | ||
1470 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); | 1610 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); |
1471 | if (!grp) | 1611 | if (!grp) |
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1475 | spin_lock_init(&grp->lock); | 1615 | spin_lock_init(&grp->lock); |
1476 | INIT_LIST_HEAD(&grp->task_list); | 1616 | INIT_LIST_HEAD(&grp->task_list); |
1477 | grp->gid = p->pid; | 1617 | grp->gid = p->pid; |
1618 | /* Second half of the array tracks nids where faults happen */ | ||
1619 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * | ||
1620 | nr_node_ids; | ||
1621 | |||
1622 | node_set(task_node(current), grp->active_nodes); | ||
1478 | 1623 | ||
1479 | for (i = 0; i < 2*nr_node_ids; i++) | 1624 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
1480 | grp->faults[i] = p->numa_faults[i]; | 1625 | grp->faults[i] = p->numa_faults_memory[i]; |
1481 | 1626 | ||
1482 | grp->total_faults = p->total_numa_faults; | 1627 | grp->total_faults = p->total_numa_faults; |
1483 | 1628 | ||
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1534 | 1679 | ||
1535 | double_lock(&my_grp->lock, &grp->lock); | 1680 | double_lock(&my_grp->lock, &grp->lock); |
1536 | 1681 | ||
1537 | for (i = 0; i < 2*nr_node_ids; i++) { | 1682 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { |
1538 | my_grp->faults[i] -= p->numa_faults[i]; | 1683 | my_grp->faults[i] -= p->numa_faults_memory[i]; |
1539 | grp->faults[i] += p->numa_faults[i]; | 1684 | grp->faults[i] += p->numa_faults_memory[i]; |
1540 | } | 1685 | } |
1541 | my_grp->total_faults -= p->total_numa_faults; | 1686 | my_grp->total_faults -= p->total_numa_faults; |
1542 | grp->total_faults += p->total_numa_faults; | 1687 | grp->total_faults += p->total_numa_faults; |
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p) | |||
1562 | { | 1707 | { |
1563 | struct numa_group *grp = p->numa_group; | 1708 | struct numa_group *grp = p->numa_group; |
1564 | int i; | 1709 | int i; |
1565 | void *numa_faults = p->numa_faults; | 1710 | void *numa_faults = p->numa_faults_memory; |
1566 | 1711 | ||
1567 | if (grp) { | 1712 | if (grp) { |
1568 | spin_lock(&grp->lock); | 1713 | spin_lock(&grp->lock); |
1569 | for (i = 0; i < 2*nr_node_ids; i++) | 1714 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
1570 | grp->faults[i] -= p->numa_faults[i]; | 1715 | grp->faults[i] -= p->numa_faults_memory[i]; |
1571 | grp->total_faults -= p->total_numa_faults; | 1716 | grp->total_faults -= p->total_numa_faults; |
1572 | 1717 | ||
1573 | list_del(&p->numa_entry); | 1718 | list_del(&p->numa_entry); |
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p) | |||
1577 | put_numa_group(grp); | 1722 | put_numa_group(grp); |
1578 | } | 1723 | } |
1579 | 1724 | ||
1580 | p->numa_faults = NULL; | 1725 | p->numa_faults_memory = NULL; |
1581 | p->numa_faults_buffer = NULL; | 1726 | p->numa_faults_buffer_memory = NULL; |
1727 | p->numa_faults_cpu = NULL; | ||
1728 | p->numa_faults_buffer_cpu = NULL; | ||
1582 | kfree(numa_faults); | 1729 | kfree(numa_faults); |
1583 | } | 1730 | } |
1584 | 1731 | ||
1585 | /* | 1732 | /* |
1586 | * Got a PROT_NONE fault for a page on @node. | 1733 | * Got a PROT_NONE fault for a page on @node. |
1587 | */ | 1734 | */ |
1588 | void task_numa_fault(int last_cpupid, int node, int pages, int flags) | 1735 | void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) |
1589 | { | 1736 | { |
1590 | struct task_struct *p = current; | 1737 | struct task_struct *p = current; |
1591 | bool migrated = flags & TNF_MIGRATED; | 1738 | bool migrated = flags & TNF_MIGRATED; |
1739 | int cpu_node = task_node(current); | ||
1592 | int priv; | 1740 | int priv; |
1593 | 1741 | ||
1594 | if (!numabalancing_enabled) | 1742 | if (!numabalancing_enabled) |
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) | |||
1603 | return; | 1751 | return; |
1604 | 1752 | ||
1605 | /* Allocate buffer to track faults on a per-node basis */ | 1753 | /* Allocate buffer to track faults on a per-node basis */ |
1606 | if (unlikely(!p->numa_faults)) { | 1754 | if (unlikely(!p->numa_faults_memory)) { |
1607 | int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; | 1755 | int size = sizeof(*p->numa_faults_memory) * |
1756 | NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; | ||
1608 | 1757 | ||
1609 | /* numa_faults and numa_faults_buffer share the allocation */ | 1758 | p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); |
1610 | p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); | 1759 | if (!p->numa_faults_memory) |
1611 | if (!p->numa_faults) | ||
1612 | return; | 1760 | return; |
1613 | 1761 | ||
1614 | BUG_ON(p->numa_faults_buffer); | 1762 | BUG_ON(p->numa_faults_buffer_memory); |
1615 | p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); | 1763 | /* |
1764 | * The averaged statistics, shared & private, memory & cpu, | ||
1765 | * occupy the first half of the array. The second half of the | ||
1766 | * array is for current counters, which are averaged into the | ||
1767 | * first set by task_numa_placement. | ||
1768 | */ | ||
1769 | p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); | ||
1770 | p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); | ||
1771 | p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); | ||
1616 | p->total_numa_faults = 0; | 1772 | p->total_numa_faults = 0; |
1617 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | 1773 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); |
1618 | } | 1774 | } |
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) | |||
1641 | if (migrated) | 1797 | if (migrated) |
1642 | p->numa_pages_migrated += pages; | 1798 | p->numa_pages_migrated += pages; |
1643 | 1799 | ||
1644 | p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; | 1800 | p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; |
1801 | p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; | ||
1645 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; | 1802 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; |
1646 | } | 1803 | } |
1647 | 1804 | ||
@@ -2414,7 +2571,8 @@ void idle_exit_fair(struct rq *this_rq) | |||
2414 | update_rq_runnable_avg(this_rq, 0); | 2571 | update_rq_runnable_avg(this_rq, 0); |
2415 | } | 2572 | } |
2416 | 2573 | ||
2417 | #else | 2574 | #else /* CONFIG_SMP */ |
2575 | |||
2418 | static inline void update_entity_load_avg(struct sched_entity *se, | 2576 | static inline void update_entity_load_avg(struct sched_entity *se, |
2419 | int update_cfs_rq) {} | 2577 | int update_cfs_rq) {} |
2420 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} | 2578 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} |
@@ -2426,7 +2584,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2426 | int sleep) {} | 2584 | int sleep) {} |
2427 | static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | 2585 | static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, |
2428 | int force_update) {} | 2586 | int force_update) {} |
2429 | #endif | 2587 | #endif /* CONFIG_SMP */ |
2430 | 2588 | ||
2431 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 2589 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2432 | { | 2590 | { |
@@ -2576,10 +2734,10 @@ static void __clear_buddies_last(struct sched_entity *se) | |||
2576 | { | 2734 | { |
2577 | for_each_sched_entity(se) { | 2735 | for_each_sched_entity(se) { |
2578 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2736 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2579 | if (cfs_rq->last == se) | 2737 | if (cfs_rq->last != se) |
2580 | cfs_rq->last = NULL; | ||
2581 | else | ||
2582 | break; | 2738 | break; |
2739 | |||
2740 | cfs_rq->last = NULL; | ||
2583 | } | 2741 | } |
2584 | } | 2742 | } |
2585 | 2743 | ||
@@ -2587,10 +2745,10 @@ static void __clear_buddies_next(struct sched_entity *se) | |||
2587 | { | 2745 | { |
2588 | for_each_sched_entity(se) { | 2746 | for_each_sched_entity(se) { |
2589 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2747 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2590 | if (cfs_rq->next == se) | 2748 | if (cfs_rq->next != se) |
2591 | cfs_rq->next = NULL; | ||
2592 | else | ||
2593 | break; | 2749 | break; |
2750 | |||
2751 | cfs_rq->next = NULL; | ||
2594 | } | 2752 | } |
2595 | } | 2753 | } |
2596 | 2754 | ||
@@ -2598,10 +2756,10 @@ static void __clear_buddies_skip(struct sched_entity *se) | |||
2598 | { | 2756 | { |
2599 | for_each_sched_entity(se) { | 2757 | for_each_sched_entity(se) { |
2600 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2758 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2601 | if (cfs_rq->skip == se) | 2759 | if (cfs_rq->skip != se) |
2602 | cfs_rq->skip = NULL; | ||
2603 | else | ||
2604 | break; | 2760 | break; |
2761 | |||
2762 | cfs_rq->skip = NULL; | ||
2605 | } | 2763 | } |
2606 | } | 2764 | } |
2607 | 2765 | ||
@@ -2744,17 +2902,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | |||
2744 | * 3) pick the "last" process, for cache locality | 2902 | * 3) pick the "last" process, for cache locality |
2745 | * 4) do not run the "skip" process, if something else is available | 2903 | * 4) do not run the "skip" process, if something else is available |
2746 | */ | 2904 | */ |
2747 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 2905 | static struct sched_entity * |
2906 | pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) | ||
2748 | { | 2907 | { |
2749 | struct sched_entity *se = __pick_first_entity(cfs_rq); | 2908 | struct sched_entity *left = __pick_first_entity(cfs_rq); |
2750 | struct sched_entity *left = se; | 2909 | struct sched_entity *se; |
2910 | |||
2911 | /* | ||
2912 | * If curr is set we have to see if it's left of the leftmost entity | ||
2913 | * still in the tree, provided there was anything in the tree at all. | ||
2914 | */ | ||
2915 | if (!left || (curr && entity_before(curr, left))) | ||
2916 | left = curr; | ||
2917 | |||
2918 | se = left; /* ideally we run the leftmost entity */ | ||
2751 | 2919 | ||
2752 | /* | 2920 | /* |
2753 | * Avoid running the skip buddy, if running something else can | 2921 | * Avoid running the skip buddy, if running something else can |
2754 | * be done without getting too unfair. | 2922 | * be done without getting too unfair. |
2755 | */ | 2923 | */ |
2756 | if (cfs_rq->skip == se) { | 2924 | if (cfs_rq->skip == se) { |
2757 | struct sched_entity *second = __pick_next_entity(se); | 2925 | struct sched_entity *second; |
2926 | |||
2927 | if (se == curr) { | ||
2928 | second = __pick_first_entity(cfs_rq); | ||
2929 | } else { | ||
2930 | second = __pick_next_entity(se); | ||
2931 | if (!second || (curr && entity_before(curr, second))) | ||
2932 | second = curr; | ||
2933 | } | ||
2934 | |||
2758 | if (second && wakeup_preempt_entity(second, left) < 1) | 2935 | if (second && wakeup_preempt_entity(second, left) < 1) |
2759 | se = second; | 2936 | se = second; |
2760 | } | 2937 | } |
@@ -2776,7 +2953,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
2776 | return se; | 2953 | return se; |
2777 | } | 2954 | } |
2778 | 2955 | ||
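pick_next_entity() now has to weigh cfs_rq->curr explicitly because, with put_prev_entity() deferred, the running entity is no longer in the rbtree that __pick_first_entity() walks. The ordering test it leans on, entity_before(), compares vruntimes through a signed difference so the result stays correct across u64 wraparound; the helper below only mirrors that comparison for illustration, it is not the kernel function:

#include <stdint.h>
#include <stdio.h>

/* same shape as entity_before(): the signed delta survives u64 wraparound */
static int vruntime_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	/* b has wrapped past zero; a is still 16 behind it and orders first */
	printf("%d\n", vruntime_before(UINT64_MAX - 5, 10));	/* prints 1 */
	printf("%d\n", vruntime_before(10, UINT64_MAX - 5));	/* prints 0 */
	return 0;
}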
2779 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); | 2956 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); |
2780 | 2957 | ||
2781 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | 2958 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) |
2782 | { | 2959 | { |
@@ -3431,22 +3608,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
3431 | } | 3608 | } |
3432 | 3609 | ||
3433 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | 3610 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ |
3434 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 3611 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
3435 | { | 3612 | { |
3436 | if (!cfs_bandwidth_used()) | 3613 | if (!cfs_bandwidth_used()) |
3437 | return; | 3614 | return false; |
3438 | 3615 | ||
3439 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | 3616 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) |
3440 | return; | 3617 | return false; |
3441 | 3618 | ||
3442 | /* | 3619 | /* |
3443 | * it's possible for a throttled entity to be forced into a running | 3620 | * it's possible for a throttled entity to be forced into a running |
3444 | * state (e.g. set_curr_task), in this case we're finished. | 3621 | * state (e.g. set_curr_task), in this case we're finished. |
3445 | */ | 3622 | */ |
3446 | if (cfs_rq_throttled(cfs_rq)) | 3623 | if (cfs_rq_throttled(cfs_rq)) |
3447 | return; | 3624 | return true; |
3448 | 3625 | ||
3449 | throttle_cfs_rq(cfs_rq); | 3626 | throttle_cfs_rq(cfs_rq); |
3627 | return true; | ||
3450 | } | 3628 | } |
3451 | 3629 | ||
3452 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | 3630 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) |
@@ -3556,7 +3734,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | |||
3556 | } | 3734 | } |
3557 | 3735 | ||
3558 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} | 3736 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} |
3559 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 3737 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } |
3560 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 3738 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
3561 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 3739 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
3562 | 3740 | ||
@@ -4492,26 +4670,125 @@ preempt: | |||
4492 | set_last_buddy(se); | 4670 | set_last_buddy(se); |
4493 | } | 4671 | } |
4494 | 4672 | ||
4495 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 4673 | static struct task_struct * |
4674 | pick_next_task_fair(struct rq *rq, struct task_struct *prev) | ||
4496 | { | 4675 | { |
4497 | struct task_struct *p; | ||
4498 | struct cfs_rq *cfs_rq = &rq->cfs; | 4676 | struct cfs_rq *cfs_rq = &rq->cfs; |
4499 | struct sched_entity *se; | 4677 | struct sched_entity *se; |
4678 | struct task_struct *p; | ||
4500 | 4679 | ||
4680 | again: __maybe_unused | ||
4681 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
4501 | if (!cfs_rq->nr_running) | 4682 | if (!cfs_rq->nr_running) |
4502 | return NULL; | 4683 | goto idle; |
4684 | |||
4685 | if (!prev || prev->sched_class != &fair_sched_class) | ||
4686 | goto simple; | ||
4687 | |||
4688 | /* | ||
4689 | * Because of the set_next_buddy() in dequeue_task_fair() it is rather | ||
4690 | * likely that a next task is from the same cgroup as the current. | ||
4691 | * | ||
4692 | * Therefore attempt to avoid putting and setting the entire cgroup | ||
4693 | * hierarchy, only change the part that actually changes. | ||
4694 | */ | ||
4503 | 4695 | ||
4504 | do { | 4696 | do { |
4505 | se = pick_next_entity(cfs_rq); | 4697 | struct sched_entity *curr = cfs_rq->curr; |
4698 | |||
4699 | /* | ||
4700 | * Since we got here without doing put_prev_entity() we also | ||
4701 | * have to consider cfs_rq->curr. If it is still a runnable | ||
4702 | * entity, update_curr() will update its vruntime, otherwise | ||
4703 | * forget we've ever seen it. | ||
4704 | */ | ||
4705 | if (curr && curr->on_rq) | ||
4706 | update_curr(cfs_rq); | ||
4707 | else | ||
4708 | curr = NULL; | ||
4709 | |||
4710 | /* | ||
4711 | * This call to check_cfs_rq_runtime() will do the throttle and | ||
4712 | * dequeue its entity in the parent(s). Therefore the 'simple' | ||
4713 | * nr_running test will indeed be correct. | ||
4714 | */ | ||
4715 | if (unlikely(check_cfs_rq_runtime(cfs_rq))) | ||
4716 | goto simple; | ||
4717 | |||
4718 | se = pick_next_entity(cfs_rq, curr); | ||
4719 | cfs_rq = group_cfs_rq(se); | ||
4720 | } while (cfs_rq); | ||
4721 | |||
4722 | p = task_of(se); | ||
4723 | |||
4724 | /* | ||
4725 | * Since we haven't yet done put_prev_entity and if the selected task | ||
4726 | * is a different task than we started out with, try and touch the | ||
4727 | * least amount of cfs_rqs. | ||
4728 | */ | ||
4729 | if (prev != p) { | ||
4730 | struct sched_entity *pse = &prev->se; | ||
4731 | |||
4732 | while (!(cfs_rq = is_same_group(se, pse))) { | ||
4733 | int se_depth = se->depth; | ||
4734 | int pse_depth = pse->depth; | ||
4735 | |||
4736 | if (se_depth <= pse_depth) { | ||
4737 | put_prev_entity(cfs_rq_of(pse), pse); | ||
4738 | pse = parent_entity(pse); | ||
4739 | } | ||
4740 | if (se_depth >= pse_depth) { | ||
4741 | set_next_entity(cfs_rq_of(se), se); | ||
4742 | se = parent_entity(se); | ||
4743 | } | ||
4744 | } | ||
4745 | |||
4746 | put_prev_entity(cfs_rq, pse); | ||
4747 | set_next_entity(cfs_rq, se); | ||
4748 | } | ||
4749 | |||
4750 | if (hrtick_enabled(rq)) | ||
4751 | hrtick_start_fair(rq, p); | ||
4752 | |||
4753 | return p; | ||
4754 | simple: | ||
4755 | cfs_rq = &rq->cfs; | ||
4756 | #endif | ||
4757 | |||
4758 | if (!cfs_rq->nr_running) | ||
4759 | goto idle; | ||
4760 | |||
4761 | if (prev) | ||
4762 | prev->sched_class->put_prev_task(rq, prev); | ||
4763 | |||
4764 | do { | ||
4765 | se = pick_next_entity(cfs_rq, NULL); | ||
4506 | set_next_entity(cfs_rq, se); | 4766 | set_next_entity(cfs_rq, se); |
4507 | cfs_rq = group_cfs_rq(se); | 4767 | cfs_rq = group_cfs_rq(se); |
4508 | } while (cfs_rq); | 4768 | } while (cfs_rq); |
4509 | 4769 | ||
4510 | p = task_of(se); | 4770 | p = task_of(se); |
4771 | |||
4511 | if (hrtick_enabled(rq)) | 4772 | if (hrtick_enabled(rq)) |
4512 | hrtick_start_fair(rq, p); | 4773 | hrtick_start_fair(rq, p); |
4513 | 4774 | ||
4514 | return p; | 4775 | return p; |
4776 | |||
4777 | idle: | ||
4778 | #ifdef CONFIG_SMP | ||
4779 | idle_enter_fair(rq); | ||
4780 | /* | ||
4781 | * We must set idle_stamp _before_ calling idle_balance(), such that we | ||
4782 | * measure the duration of idle_balance() as idle time. | ||
4783 | */ | ||
4784 | rq->idle_stamp = rq_clock(rq); | ||
4785 | if (idle_balance(rq)) { /* drops rq->lock */ | ||
4786 | rq->idle_stamp = 0; | ||
4787 | goto again; | ||
4788 | } | ||
4789 | #endif | ||
4790 | |||
4791 | return NULL; | ||
4515 | } | 4792 | } |
4516 | 4793 | ||
4517 | /* | 4794 | /* |
@@ -4783,7 +5060,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | |||
4783 | { | 5060 | { |
4784 | int src_nid, dst_nid; | 5061 | int src_nid, dst_nid; |
4785 | 5062 | ||
4786 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || | 5063 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || |
4787 | !(env->sd->flags & SD_NUMA)) { | 5064 | !(env->sd->flags & SD_NUMA)) { |
4788 | return false; | 5065 | return false; |
4789 | } | 5066 | } |
@@ -4814,7 +5091,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
4814 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | 5091 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) |
4815 | return false; | 5092 | return false; |
4816 | 5093 | ||
4817 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | 5094 | if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) |
4818 | return false; | 5095 | return false; |
4819 | 5096 | ||
4820 | src_nid = cpu_to_node(env->src_cpu); | 5097 | src_nid = cpu_to_node(env->src_cpu); |
@@ -6357,17 +6634,16 @@ out: | |||
6357 | * idle_balance is called by schedule() if this_cpu is about to become | 6634 | * idle_balance is called by schedule() if this_cpu is about to become |
6358 | * idle. Attempts to pull tasks from other CPUs. | 6635 | * idle. Attempts to pull tasks from other CPUs. |
6359 | */ | 6636 | */ |
6360 | void idle_balance(int this_cpu, struct rq *this_rq) | 6637 | int idle_balance(struct rq *this_rq) |
6361 | { | 6638 | { |
6362 | struct sched_domain *sd; | 6639 | struct sched_domain *sd; |
6363 | int pulled_task = 0; | 6640 | int pulled_task = 0; |
6364 | unsigned long next_balance = jiffies + HZ; | 6641 | unsigned long next_balance = jiffies + HZ; |
6365 | u64 curr_cost = 0; | 6642 | u64 curr_cost = 0; |
6366 | 6643 | int this_cpu = this_rq->cpu; | |
6367 | this_rq->idle_stamp = rq_clock(this_rq); | ||
6368 | 6644 | ||
6369 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 6645 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
6370 | return; | 6646 | return 0; |
6371 | 6647 | ||
6372 | /* | 6648 | /* |
6373 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 6649 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
@@ -6405,15 +6681,20 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
6405 | interval = msecs_to_jiffies(sd->balance_interval); | 6681 | interval = msecs_to_jiffies(sd->balance_interval); |
6406 | if (time_after(next_balance, sd->last_balance + interval)) | 6682 | if (time_after(next_balance, sd->last_balance + interval)) |
6407 | next_balance = sd->last_balance + interval; | 6683 | next_balance = sd->last_balance + interval; |
6408 | if (pulled_task) { | 6684 | if (pulled_task) |
6409 | this_rq->idle_stamp = 0; | ||
6410 | break; | 6685 | break; |
6411 | } | ||
6412 | } | 6686 | } |
6413 | rcu_read_unlock(); | 6687 | rcu_read_unlock(); |
6414 | 6688 | ||
6415 | raw_spin_lock(&this_rq->lock); | 6689 | raw_spin_lock(&this_rq->lock); |
6416 | 6690 | ||
6691 | /* | ||
6692 | * While browsing the domains, we released the rq lock. | ||
6693 | * A task could have been enqueued in the meantime | ||
6694 | */ | ||
6695 | if (this_rq->nr_running && !pulled_task) | ||
6696 | return 1; | ||
6697 | |||
6417 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | 6698 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { |
6418 | /* | 6699 | /* |
6419 | * We are going idle. next_balance may be set based on | 6700 | * We are going idle. next_balance may be set based on |
@@ -6424,6 +6705,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
6424 | 6705 | ||
6425 | if (curr_cost > this_rq->max_idle_balance_cost) | 6706 | if (curr_cost > this_rq->max_idle_balance_cost) |
6426 | this_rq->max_idle_balance_cost = curr_cost; | 6707 | this_rq->max_idle_balance_cost = curr_cost; |
6708 | |||
6709 | return pulled_task; | ||
6427 | } | 6710 | } |
6428 | 6711 | ||
6429 | /* | 6712 | /* |
@@ -7082,7 +7365,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
7082 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7365 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7083 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 7366 | static void task_move_group_fair(struct task_struct *p, int on_rq) |
7084 | { | 7367 | { |
7368 | struct sched_entity *se = &p->se; | ||
7085 | struct cfs_rq *cfs_rq; | 7369 | struct cfs_rq *cfs_rq; |
7370 | |||
7086 | /* | 7371 | /* |
7087 | * If the task was not on the rq at the time of this cgroup movement | 7372 | * If the task was not on the rq at the time of this cgroup movement |
7088 | * it must have been asleep, sleeping tasks keep their ->vruntime | 7373 | * it must have been asleep, sleeping tasks keep their ->vruntime |
@@ -7108,23 +7393,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
7108 | * To prevent boost or penalty in the new cfs_rq caused by delta | 7393 | * To prevent boost or penalty in the new cfs_rq caused by delta |
7109 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | 7394 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. |
7110 | */ | 7395 | */ |
7111 | if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) | 7396 | if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) |
7112 | on_rq = 1; | 7397 | on_rq = 1; |
7113 | 7398 | ||
7114 | if (!on_rq) | 7399 | if (!on_rq) |
7115 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | 7400 | se->vruntime -= cfs_rq_of(se)->min_vruntime; |
7116 | set_task_rq(p, task_cpu(p)); | 7401 | set_task_rq(p, task_cpu(p)); |
7402 | se->depth = se->parent ? se->parent->depth + 1 : 0; | ||
7117 | if (!on_rq) { | 7403 | if (!on_rq) { |
7118 | cfs_rq = cfs_rq_of(&p->se); | 7404 | cfs_rq = cfs_rq_of(se); |
7119 | p->se.vruntime += cfs_rq->min_vruntime; | 7405 | se->vruntime += cfs_rq->min_vruntime; |
7120 | #ifdef CONFIG_SMP | 7406 | #ifdef CONFIG_SMP |
7121 | /* | 7407 | /* |
7122 | * migrate_task_rq_fair() will have removed our previous | 7408 | * migrate_task_rq_fair() will have removed our previous |
7123 | * contribution, but we must synchronize for ongoing future | 7409 | * contribution, but we must synchronize for ongoing future |
7124 | * decay. | 7410 | * decay. |
7125 | */ | 7411 | */ |
7126 | p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 7412 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
7127 | cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; | 7413 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; |
7128 | #endif | 7414 | #endif |
7129 | } | 7415 | } |
7130 | } | 7416 | } |
@@ -7220,10 +7506,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7220 | if (!se) | 7506 | if (!se) |
7221 | return; | 7507 | return; |
7222 | 7508 | ||
7223 | if (!parent) | 7509 | if (!parent) { |
7224 | se->cfs_rq = &rq->cfs; | 7510 | se->cfs_rq = &rq->cfs; |
7225 | else | 7511 | se->depth = 0; |
7512 | } else { | ||
7226 | se->cfs_rq = parent->my_q; | 7513 | se->cfs_rq = parent->my_q; |
7514 | se->depth = parent->depth + 1; | ||
7515 | } | ||
7227 | 7516 | ||
7228 | se->my_q = cfs_rq; | 7517 | se->my_q = cfs_rq; |
7229 | /* guarantee group entities always have weight */ | 7518 | /* guarantee group entities always have weight */ |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c new file mode 100644 index 000000000000..14ca43430aee --- /dev/null +++ b/kernel/sched/idle.c | |||
@@ -0,0 +1,144 @@ | |||
1 | /* | ||
2 | * Generic entry point for the idle threads | ||
3 | */ | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/cpu.h> | ||
6 | #include <linux/cpuidle.h> | ||
7 | #include <linux/tick.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/stackprotector.h> | ||
10 | |||
11 | #include <asm/tlb.h> | ||
12 | |||
13 | #include <trace/events/power.h> | ||
14 | |||
15 | static int __read_mostly cpu_idle_force_poll; | ||
16 | |||
17 | void cpu_idle_poll_ctrl(bool enable) | ||
18 | { | ||
19 | if (enable) { | ||
20 | cpu_idle_force_poll++; | ||
21 | } else { | ||
22 | cpu_idle_force_poll--; | ||
23 | WARN_ON_ONCE(cpu_idle_force_poll < 0); | ||
24 | } | ||
25 | } | ||
26 | |||
27 | #ifdef CONFIG_GENERIC_IDLE_POLL_SETUP | ||
28 | static int __init cpu_idle_poll_setup(char *__unused) | ||
29 | { | ||
30 | cpu_idle_force_poll = 1; | ||
31 | return 1; | ||
32 | } | ||
33 | __setup("nohlt", cpu_idle_poll_setup); | ||
34 | |||
35 | static int __init cpu_idle_nopoll_setup(char *__unused) | ||
36 | { | ||
37 | cpu_idle_force_poll = 0; | ||
38 | return 1; | ||
39 | } | ||
40 | __setup("hlt", cpu_idle_nopoll_setup); | ||
41 | #endif | ||
42 | |||
43 | static inline int cpu_idle_poll(void) | ||
44 | { | ||
45 | rcu_idle_enter(); | ||
46 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | ||
47 | local_irq_enable(); | ||
48 | while (!tif_need_resched()) | ||
49 | cpu_relax(); | ||
50 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | ||
51 | rcu_idle_exit(); | ||
52 | return 1; | ||
53 | } | ||
54 | |||
55 | /* Weak implementations for optional arch specific functions */ | ||
56 | void __weak arch_cpu_idle_prepare(void) { } | ||
57 | void __weak arch_cpu_idle_enter(void) { } | ||
58 | void __weak arch_cpu_idle_exit(void) { } | ||
59 | void __weak arch_cpu_idle_dead(void) { } | ||
60 | void __weak arch_cpu_idle(void) | ||
61 | { | ||
62 | cpu_idle_force_poll = 1; | ||
63 | local_irq_enable(); | ||
64 | } | ||
65 | |||
66 | /* | ||
67 | * Generic idle loop implementation | ||
68 | */ | ||
69 | static void cpu_idle_loop(void) | ||
70 | { | ||
71 | while (1) { | ||
72 | tick_nohz_idle_enter(); | ||
73 | |||
74 | while (!need_resched()) { | ||
75 | check_pgt_cache(); | ||
76 | rmb(); | ||
77 | |||
78 | if (cpu_is_offline(smp_processor_id())) | ||
79 | arch_cpu_idle_dead(); | ||
80 | |||
81 | local_irq_disable(); | ||
82 | arch_cpu_idle_enter(); | ||
83 | |||
84 | /* | ||
85 | * In poll mode we reenable interrupts and spin. | ||
86 | * | ||
87 | * Also if we detected in the wakeup from idle | ||
88 | * path that the tick broadcast device expired | ||
89 | * for us, we don't want to go deep idle as we | ||
90 | * know that the IPI is going to arrive right | ||
91 | * away | ||
92 | */ | ||
93 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) { | ||
94 | cpu_idle_poll(); | ||
95 | } else { | ||
96 | if (!current_clr_polling_and_test()) { | ||
97 | stop_critical_timings(); | ||
98 | rcu_idle_enter(); | ||
99 | if (cpuidle_idle_call()) | ||
100 | arch_cpu_idle(); | ||
101 | if (WARN_ON_ONCE(irqs_disabled())) | ||
102 | local_irq_enable(); | ||
103 | rcu_idle_exit(); | ||
104 | start_critical_timings(); | ||
105 | } else { | ||
106 | local_irq_enable(); | ||
107 | } | ||
108 | __current_set_polling(); | ||
109 | } | ||
110 | arch_cpu_idle_exit(); | ||
111 | /* | ||
112 | * We need to test and propagate the TIF_NEED_RESCHED | ||
113 | * bit here because we might not have sent the | ||
114 | * reschedule IPI to idle tasks. | ||
115 | */ | ||
116 | if (tif_need_resched()) | ||
117 | set_preempt_need_resched(); | ||
118 | } | ||
119 | tick_nohz_idle_exit(); | ||
120 | schedule_preempt_disabled(); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | void cpu_startup_entry(enum cpuhp_state state) | ||
125 | { | ||
126 | /* | ||
127 | * This #ifdef needs to die, but it's too late in the cycle to | ||
128 | * make this generic (arm and sh have never invoked the canary | ||
129 | * init for the non boot cpus!). Will be fixed in 3.11 | ||
130 | */ | ||
131 | #ifdef CONFIG_X86 | ||
132 | /* | ||
133 | * If we're the non-boot CPU, nothing set the stack canary up | ||
134 | * for us. The boot CPU already has it initialized but no harm | ||
135 | * in doing it again. This is a good place for updating it, as | ||
136 | * we won't ever return from this function (so the invalid | ||
137 | * canaries already on the stack won't ever trigger). | ||
138 | */ | ||
139 | boot_init_stack_canary(); | ||
140 | #endif | ||
141 | __current_set_polling(); | ||
142 | arch_cpu_idle_prepare(); | ||
143 | cpu_idle_loop(); | ||
144 | } | ||
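cpu_idle_poll_ctrl() above is reference counted, so nested enable/disable pairs compose. A hypothetical caller, sketched with invented function names (real users are drivers and PM code that need idle CPUs to stay in the polling loop for a short latency-critical window):

#include <linux/cpu.h>

void my_low_latency_window_open(void)	/* hypothetical helper */
{
	cpu_idle_poll_ctrl(true);	/* idle CPUs spin instead of halting */
}

void my_low_latency_window_close(void)	/* hypothetical helper */
{
	cpu_idle_poll_ctrl(false);	/* drop the reference, normal idle resumes */
}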
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 516c3d9ceea1..f7d03af79a5b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
13 | { | 13 | { |
14 | return task_cpu(p); /* IDLE tasks are never migrated */ | 14 | return task_cpu(p); /* IDLE tasks are never migrated */ |
15 | } | 15 | } |
16 | |||
17 | static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) | ||
18 | { | ||
19 | idle_exit_fair(rq); | ||
20 | rq_last_tick_reset(rq); | ||
21 | } | ||
22 | |||
23 | static void post_schedule_idle(struct rq *rq) | ||
24 | { | ||
25 | idle_enter_fair(rq); | ||
26 | } | ||
27 | #endif /* CONFIG_SMP */ | 16 | #endif /* CONFIG_SMP */ |
17 | |||
28 | /* | 18 | /* |
29 | * Idle tasks are unconditionally rescheduled: | 19 | * Idle tasks are unconditionally rescheduled: |
30 | */ | 20 | */ |
@@ -33,12 +23,15 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
33 | resched_task(rq->idle); | 23 | resched_task(rq->idle); |
34 | } | 24 | } |
35 | 25 | ||
36 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 26 | static struct task_struct * |
27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev) | ||
37 | { | 28 | { |
29 | if (prev) | ||
30 | prev->sched_class->put_prev_task(rq, prev); | ||
31 | |||
38 | schedstat_inc(rq, sched_goidle); | 32 | schedstat_inc(rq, sched_goidle); |
39 | #ifdef CONFIG_SMP | 33 | #ifdef CONFIG_SMP |
40 | /* Trigger the post schedule to do an idle_enter for CFS */ | 34 | idle_enter_fair(rq); |
41 | rq->post_schedule = 1; | ||
42 | #endif | 35 | #endif |
43 | return rq->idle; | 36 | return rq->idle; |
44 | } | 37 | } |
@@ -58,6 +51,10 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | |||
58 | 51 | ||
59 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | 52 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) |
60 | { | 53 | { |
54 | #ifdef CONFIG_SMP | ||
55 | idle_exit_fair(rq); | ||
56 | rq_last_tick_reset(rq); | ||
57 | #endif | ||
61 | } | 58 | } |
62 | 59 | ||
63 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | 60 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) |
@@ -101,8 +98,6 @@ const struct sched_class idle_sched_class = { | |||
101 | 98 | ||
102 | #ifdef CONFIG_SMP | 99 | #ifdef CONFIG_SMP |
103 | .select_task_rq = select_task_rq_idle, | 100 | .select_task_rq = select_task_rq_idle, |
104 | .pre_schedule = pre_schedule_idle, | ||
105 | .post_schedule = post_schedule_idle, | ||
106 | #endif | 101 | #endif |
107 | 102 | ||
108 | .set_curr_task = set_curr_task_idle, | 103 | .set_curr_task = set_curr_task_idle, |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b45..72f9ec759972 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -229,6 +229,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
229 | 229 | ||
230 | #ifdef CONFIG_SMP | 230 | #ifdef CONFIG_SMP |
231 | 231 | ||
232 | static int pull_rt_task(struct rq *this_rq); | ||
233 | |||
232 | static inline int rt_overloaded(struct rq *rq) | 234 | static inline int rt_overloaded(struct rq *rq) |
233 | { | 235 | { |
234 | return atomic_read(&rq->rd->rto_count); | 236 | return atomic_read(&rq->rd->rto_count); |
@@ -1310,15 +1312,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
1310 | { | 1312 | { |
1311 | struct sched_rt_entity *rt_se; | 1313 | struct sched_rt_entity *rt_se; |
1312 | struct task_struct *p; | 1314 | struct task_struct *p; |
1313 | struct rt_rq *rt_rq; | 1315 | struct rt_rq *rt_rq = &rq->rt; |
1314 | |||
1315 | rt_rq = &rq->rt; | ||
1316 | |||
1317 | if (!rt_rq->rt_nr_running) | ||
1318 | return NULL; | ||
1319 | |||
1320 | if (rt_rq_throttled(rt_rq)) | ||
1321 | return NULL; | ||
1322 | 1316 | ||
1323 | do { | 1317 | do { |
1324 | rt_se = pick_next_rt_entity(rq, rt_rq); | 1318 | rt_se = pick_next_rt_entity(rq, rt_rq); |
@@ -1332,9 +1326,28 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
1332 | return p; | 1326 | return p; |
1333 | } | 1327 | } |
1334 | 1328 | ||
1335 | static struct task_struct *pick_next_task_rt(struct rq *rq) | 1329 | static struct task_struct * |
1330 | pick_next_task_rt(struct rq *rq, struct task_struct *prev) | ||
1336 | { | 1331 | { |
1337 | struct task_struct *p = _pick_next_task_rt(rq); | 1332 | struct task_struct *p; |
1333 | struct rt_rq *rt_rq = &rq->rt; | ||
1334 | |||
1335 | #ifdef CONFIG_SMP | ||
1336 | /* Try to pull RT tasks here if we lower this rq's prio */ | ||
1337 | if (rq->rt.highest_prio.curr > prev->prio) | ||
1338 | pull_rt_task(rq); | ||
1339 | #endif | ||
1340 | |||
1341 | if (!rt_rq->rt_nr_running) | ||
1342 | return NULL; | ||
1343 | |||
1344 | if (rt_rq_throttled(rt_rq)) | ||
1345 | return NULL; | ||
1346 | |||
1347 | if (prev) | ||
1348 | prev->sched_class->put_prev_task(rq, prev); | ||
1349 | |||
1350 | p = _pick_next_task_rt(rq); | ||
1338 | 1351 | ||
1339 | /* The running task is never eligible for pushing */ | 1352 | /* The running task is never eligible for pushing */ |
1340 | if (p) | 1353 | if (p) |
@@ -1716,13 +1729,6 @@ skip: | |||
1716 | return ret; | 1729 | return ret; |
1717 | } | 1730 | } |
1718 | 1731 | ||
1719 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | ||
1720 | { | ||
1721 | /* Try to pull RT tasks here if we lower this rq's prio */ | ||
1722 | if (rq->rt.highest_prio.curr > prev->prio) | ||
1723 | pull_rt_task(rq); | ||
1724 | } | ||
1725 | |||
1726 | static void post_schedule_rt(struct rq *rq) | 1732 | static void post_schedule_rt(struct rq *rq) |
1727 | { | 1733 | { |
1728 | push_rt_tasks(rq); | 1734 | push_rt_tasks(rq); |
@@ -1999,7 +2005,6 @@ const struct sched_class rt_sched_class = { | |||
1999 | .set_cpus_allowed = set_cpus_allowed_rt, | 2005 | .set_cpus_allowed = set_cpus_allowed_rt, |
2000 | .rq_online = rq_online_rt, | 2006 | .rq_online = rq_online_rt, |
2001 | .rq_offline = rq_offline_rt, | 2007 | .rq_offline = rq_offline_rt, |
2002 | .pre_schedule = pre_schedule_rt, | ||
2003 | .post_schedule = post_schedule_rt, | 2008 | .post_schedule = post_schedule_rt, |
2004 | .task_woken = task_woken_rt, | 2009 | .task_woken = task_woken_rt, |
2005 | .switched_from = switched_from_rt, | 2010 | .switched_from = switched_from_rt, |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd20f8b..1bf34c257d3b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq); | |||
24 | extern void update_cpu_load_active(struct rq *this_rq); | 24 | extern void update_cpu_load_active(struct rq *this_rq); |
25 | 25 | ||
26 | /* | 26 | /* |
27 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
28 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
29 | * and back. | ||
30 | */ | ||
31 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
32 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
33 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
34 | |||
35 | /* | ||
36 | * 'User priority' is the nice value converted to something we | ||
37 | * can work with better when scaling various scheduler parameters, | ||
38 | * it's a [ 0 ... 39 ] range. | ||
39 | */ | ||
40 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
41 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
42 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
43 | |||
44 | /* | ||
45 | * Helpers for converting nanosecond timing to jiffy resolution | 27 | * Helpers for converting nanosecond timing to jiffy resolution |
46 | */ | 28 | */ |
47 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 29 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
@@ -1123,14 +1105,19 @@ struct sched_class { | |||
1123 | 1105 | ||
1124 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); | 1106 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); |
1125 | 1107 | ||
1126 | struct task_struct * (*pick_next_task) (struct rq *rq); | 1108 | /* |
1109 | * It is the responsibility of the pick_next_task() method that will | ||
1110 | * return the next task to call put_prev_task() on the @prev task or | ||
1111 | * something equivalent. | ||
1112 | */ | ||
1113 | struct task_struct * (*pick_next_task) (struct rq *rq, | ||
1114 | struct task_struct *prev); | ||
1127 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1115 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
1128 | 1116 | ||
1129 | #ifdef CONFIG_SMP | 1117 | #ifdef CONFIG_SMP |
1130 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); | 1118 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
1131 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | 1119 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); |
1132 | 1120 | ||
1133 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | ||
1134 | void (*post_schedule) (struct rq *this_rq); | 1121 | void (*post_schedule) (struct rq *this_rq); |
1135 | void (*task_waking) (struct task_struct *task); | 1122 | void (*task_waking) (struct task_struct *task); |
1136 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1123 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); |
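For context, the caller side of this contract (the kernel/sched/core.c part of the merge, which is not quoted in this section) amounts to trying each class in priority order and trusting whichever class returns a task to have already put @prev. A condensed sketch of that shape, not the literal core.c code:

static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p)
			return p;	/* this class already put @prev */
	}

	BUG();	/* the idle class always has something to run */
}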
@@ -1176,7 +1163,7 @@ extern const struct sched_class idle_sched_class; | |||
1176 | extern void update_group_power(struct sched_domain *sd, int cpu); | 1163 | extern void update_group_power(struct sched_domain *sd, int cpu); |
1177 | 1164 | ||
1178 | extern void trigger_load_balance(struct rq *rq); | 1165 | extern void trigger_load_balance(struct rq *rq); |
1179 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1166 | extern int idle_balance(struct rq *this_rq); |
1180 | 1167 | ||
1181 | extern void idle_enter_fair(struct rq *this_rq); | 1168 | extern void idle_enter_fair(struct rq *this_rq); |
1182 | extern void idle_exit_fair(struct rq *this_rq); | 1169 | extern void idle_exit_fair(struct rq *this_rq); |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index fdb6bb0b3356..a4147c9d2017 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) | |||
23 | /* we're never preempted */ | 23 | /* we're never preempted */ |
24 | } | 24 | } |
25 | 25 | ||
26 | static struct task_struct *pick_next_task_stop(struct rq *rq) | 26 | static struct task_struct * |
27 | pick_next_task_stop(struct rq *rq, struct task_struct *prev) | ||
27 | { | 28 | { |
28 | struct task_struct *stop = rq->stop; | 29 | struct task_struct *stop = rq->stop; |
29 | 30 | ||
30 | if (stop && stop->on_rq) { | 31 | if (!stop || !stop->on_rq) |
31 | stop->se.exec_start = rq_clock_task(rq); | 32 | return NULL; |
32 | return stop; | ||
33 | } | ||
34 | 33 | ||
35 | return NULL; | 34 | if (prev) |
35 | prev->sched_class->put_prev_task(rq, prev); | ||
36 | |||
37 | stop->se.exec_start = rq_clock_task(rq); | ||
38 | |||
39 | return stop; | ||
36 | } | 40 | } |
37 | 41 | ||
38 | static void | 42 | static void |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 49e13e1f8fe6..7754ff16f334 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -386,13 +386,6 @@ static struct ctl_table kern_table[] = { | |||
386 | .proc_handler = proc_dointvec, | 386 | .proc_handler = proc_dointvec, |
387 | }, | 387 | }, |
388 | { | 388 | { |
389 | .procname = "numa_balancing_migrate_deferred", | ||
390 | .data = &sysctl_numa_balancing_migrate_deferred, | ||
391 | .maxlen = sizeof(unsigned int), | ||
392 | .mode = 0644, | ||
393 | .proc_handler = proc_dointvec, | ||
394 | }, | ||
395 | { | ||
396 | .procname = "numa_balancing", | 389 | .procname = "numa_balancing", |
397 | .data = NULL, /* filled in by handler */ | 390 | .data = NULL, /* filled in by handler */ |
398 | .maxlen = sizeof(unsigned int), | 391 | .maxlen = sizeof(unsigned int), |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ae3c8f3595d4..f520b9da9c1f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -2301,35 +2301,6 @@ static void sp_free(struct sp_node *n) | |||
2301 | kmem_cache_free(sn_cache, n); | 2301 | kmem_cache_free(sn_cache, n); |
2302 | } | 2302 | } |
2303 | 2303 | ||
2304 | #ifdef CONFIG_NUMA_BALANCING | ||
2305 | static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) | ||
2306 | { | ||
2307 | /* Never defer a private fault */ | ||
2308 | if (cpupid_match_pid(p, last_cpupid)) | ||
2309 | return false; | ||
2310 | |||
2311 | if (p->numa_migrate_deferred) { | ||
2312 | p->numa_migrate_deferred--; | ||
2313 | return true; | ||
2314 | } | ||
2315 | return false; | ||
2316 | } | ||
2317 | |||
2318 | static inline void defer_numa_migrate(struct task_struct *p) | ||
2319 | { | ||
2320 | p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; | ||
2321 | } | ||
2322 | #else | ||
2323 | static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) | ||
2324 | { | ||
2325 | return false; | ||
2326 | } | ||
2327 | |||
2328 | static inline void defer_numa_migrate(struct task_struct *p) | ||
2329 | { | ||
2330 | } | ||
2331 | #endif /* CONFIG_NUMA_BALANCING */ | ||
2332 | |||
2333 | /** | 2304 | /** |
2334 | * mpol_misplaced - check whether current page node is valid in policy | 2305 | * mpol_misplaced - check whether current page node is valid in policy |
2335 | * | 2306 | * |
@@ -2403,52 +2374,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2403 | 2374 | ||
2404 | /* Migrate the page towards the node whose CPU is referencing it */ | 2375 | /* Migrate the page towards the node whose CPU is referencing it */ |
2405 | if (pol->flags & MPOL_F_MORON) { | 2376 | if (pol->flags & MPOL_F_MORON) { |
2406 | int last_cpupid; | ||
2407 | int this_cpupid; | ||
2408 | |||
2409 | polnid = thisnid; | 2377 | polnid = thisnid; |
2410 | this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); | ||
2411 | |||
2412 | /* | ||
2413 | * Multi-stage node selection is used in conjunction | ||
2414 | * with a periodic migration fault to build a temporal | ||
2415 | * task<->page relation. By using a two-stage filter we | ||
2416 | * remove short/unlikely relations. | ||
2417 | * | ||
2418 | * Using P(p) ~ n_p / n_t as per frequentist | ||
2419 | * probability, we can equate a task's usage of a | ||
2420 | * particular page (n_p) per total usage of this | ||
2421 | * page (n_t) (in a given time-span) to a probability. | ||
2422 | * | ||
2423 | * Our periodic faults will sample this probability and | ||
2424 | * getting the same result twice in a row, given these | ||
2425 | * samples are fully independent, is then given by | ||
2426 | * P(n)^2, provided our sample period is sufficiently | ||
2427 | * short compared to the usage pattern. | ||
2428 | * | ||
2429 | * This quadric squishes small probabilities, making | ||
2430 | * it less likely we act on an unlikely task<->page | ||
2431 | * relation. | ||
2432 | */ | ||
2433 | last_cpupid = page_cpupid_xchg_last(page, this_cpupid); | ||
2434 | if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { | ||
2435 | 2378 | ||
2436 | /* See sysctl_numa_balancing_migrate_deferred comment */ | 2379 | if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) |
2437 | if (!cpupid_match_pid(current, last_cpupid)) | ||
2438 | defer_numa_migrate(current); | ||
2439 | |||
2440 | goto out; | ||
2441 | } | ||
2442 | |||
2443 | /* | ||
2444 | * The quadratic filter above reduces extraneous migration | ||
2445 | * of shared pages somewhat. This code reduces it even more, | ||
2446 | * reducing the overhead of page migrations of shared pages. | ||
2447 | * This makes workloads with shared pages rely more on | ||
2448 | * "move task near its memory", and less on "move memory | ||
2449 | * towards its task", which is exactly what we want. | ||
2450 | */ | ||
2451 | if (numa_migrate_deferred(current, last_cpupid)) | ||
2452 | goto out; | 2380 | goto out; |
2453 | } | 2381 | } |
2454 | 2382 | ||
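should_numa_migrate_memory() itself is added in kernel/sched/fair.c by this merge and is not quoted in this section; it absorbs the two-stage cpupid filter that used to live in the block removed above and additionally consults the group's active_nodes mask maintained by update_numa_active_node_mask(). The sketch below is only an approximation of that shape, built from the cpupid helpers visible in the removed code; the merged helper also compares per-group fault statistics that are not reproduced here:

bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
				int src_nid, int dst_cpu)
{
	struct numa_group *ng = p->numa_group;
	int dst_nid = cpu_to_node(dst_cpu);
	int last_cpupid, this_cpupid;

	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);

	/* two-stage filter, as in the comment removed above: only migrate
	 * when two consecutive samples point at the same destination */
	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
	if (!cpupid_pid_unset(last_cpupid) &&
	    cpupid_to_nid(last_cpupid) != dst_nid)
		return false;

	if (!ng)
		return true;

	/* don't pull memory toward nodes the group barely executes on */
	if (!node_isset(dst_nid, ng->active_nodes))
		return false;

	return true;	/* the real helper also weighs group fault counts here */
}

With the policy centralised in the scheduler, mpol_misplaced() is left with the single call shown above.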