diff options
| -rw-r--r-- | Documentation/scheduler/sched-domains.txt | 7 | ||||
| -rw-r--r-- | Documentation/scheduler/sched-rt-group.txt | 4 | ||||
| -rw-r--r-- | include/linux/sched.h | 59 | ||||
| -rw-r--r-- | kernel/Makefile | 5 | ||||
| -rw-r--r-- | kernel/cpu.c | 24 | ||||
| -rw-r--r-- | kernel/cpuset.c | 14 | ||||
| -rw-r--r-- | kernel/kthread.c | 1 | ||||
| -rw-r--r-- | kernel/sched.c | 723 | ||||
| -rw-r--r-- | kernel/sched_clock.c | 137 | ||||
| -rw-r--r-- | kernel/sched_cpupri.c | 174 | ||||
| -rw-r--r-- | kernel/sched_cpupri.h | 36 | ||||
| -rw-r--r-- | kernel/sched_debug.c | 64 | ||||
| -rw-r--r-- | kernel/sched_fair.c | 413 | ||||
| -rw-r--r-- | kernel/sched_features.h | 7 | ||||
| -rw-r--r-- | kernel/sched_rt.c | 405 | ||||
| -rw-r--r-- | kernel/sched_stats.h | 42 | ||||
| -rw-r--r-- | kernel/sysctl.c | 8 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 2 |
18 files changed, 1555 insertions, 570 deletions
diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt index a9e990ab980f..373ceacc367e 100644 --- a/Documentation/scheduler/sched-domains.txt +++ b/Documentation/scheduler/sched-domains.txt | |||
| @@ -61,10 +61,7 @@ builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your | |||
| 61 | arch_init_sched_domains function. This function will attach domains to all | 61 | arch_init_sched_domains function. This function will attach domains to all |
| 62 | CPUs using cpu_attach_domain. | 62 | CPUs using cpu_attach_domain. |
| 63 | 63 | ||
| 64 | Implementors should change the line | 64 | The sched-domains debugging infrastructure can be enabled by enabling |
| 65 | #undef SCHED_DOMAIN_DEBUG | 65 | CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains |
| 66 | to | ||
| 67 | #define SCHED_DOMAIN_DEBUG | ||
| 68 | in kernel/sched.c as this enables an error checking parse of the sched domains | ||
| 69 | which should catch most possible errors (described above). It also prints out | 66 | which should catch most possible errors (described above). It also prints out |
| 70 | the domain structure in a visual format. | 67 | the domain structure in a visual format. |
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 14f901f639ee..3ef339f491e0 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt | |||
| @@ -51,9 +51,9 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s = | |||
| 51 | 0.00015s. So this group can be scheduled with a period of 0.005s and a run time | 51 | 0.00015s. So this group can be scheduled with a period of 0.005s and a run time |
| 52 | of 0.00015s. | 52 | of 0.00015s. |
| 53 | 53 | ||
| 54 | The remaining CPU time will be used for user input and other tass. Because | 54 | The remaining CPU time will be used for user input and other tasks. Because |
| 55 | realtime tasks have explicitly allocated the CPU time they need to perform | 55 | realtime tasks have explicitly allocated the CPU time they need to perform |
| 56 | their tasks, buffer underruns in the graphocs or audio can be eliminated. | 56 | their tasks, buffer underruns in the graphics or audio can be eliminated. |
| 57 | 57 | ||
| 58 | NOTE: the above example is not fully implemented as of yet (2.6.25). We still | 58 | NOTE: the above example is not fully implemented as of yet (2.6.25). We still |
| 59 | lack an EDF scheduler to make non-uniform periods usable. | 59 | lack an EDF scheduler to make non-uniform periods usable. |
diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d3f847ca8d..f6cd60f2de63 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -134,7 +134,6 @@ extern unsigned long nr_running(void); | |||
| 134 | extern unsigned long nr_uninterruptible(void); | 134 | extern unsigned long nr_uninterruptible(void); |
| 135 | extern unsigned long nr_active(void); | 135 | extern unsigned long nr_active(void); |
| 136 | extern unsigned long nr_iowait(void); | 136 | extern unsigned long nr_iowait(void); |
| 137 | extern unsigned long weighted_cpuload(const int cpu); | ||
| 138 | 137 | ||
| 139 | struct seq_file; | 138 | struct seq_file; |
| 140 | struct cfs_rq; | 139 | struct cfs_rq; |
| @@ -784,6 +783,8 @@ struct sched_domain { | |||
| 784 | unsigned int balance_interval; /* initialise to 1. units in ms. */ | 783 | unsigned int balance_interval; /* initialise to 1. units in ms. */ |
| 785 | unsigned int nr_balance_failed; /* initialise to 0 */ | 784 | unsigned int nr_balance_failed; /* initialise to 0 */ |
| 786 | 785 | ||
| 786 | u64 last_update; | ||
| 787 | |||
| 787 | #ifdef CONFIG_SCHEDSTATS | 788 | #ifdef CONFIG_SCHEDSTATS |
| 788 | /* load_balance() stats */ | 789 | /* load_balance() stats */ |
| 789 | unsigned int lb_count[CPU_MAX_IDLE_TYPES]; | 790 | unsigned int lb_count[CPU_MAX_IDLE_TYPES]; |
| @@ -823,23 +824,6 @@ extern int arch_reinit_sched_domains(void); | |||
| 823 | 824 | ||
| 824 | #endif /* CONFIG_SMP */ | 825 | #endif /* CONFIG_SMP */ |
| 825 | 826 | ||
| 826 | /* | ||
| 827 | * A runqueue laden with a single nice 0 task scores a weighted_cpuload of | ||
| 828 | * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a | ||
| 829 | * task of nice 0 or enough lower priority tasks to bring up the | ||
| 830 | * weighted_cpuload | ||
| 831 | */ | ||
| 832 | static inline int above_background_load(void) | ||
| 833 | { | ||
| 834 | unsigned long cpu; | ||
| 835 | |||
| 836 | for_each_online_cpu(cpu) { | ||
| 837 | if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) | ||
| 838 | return 1; | ||
| 839 | } | ||
| 840 | return 0; | ||
| 841 | } | ||
| 842 | |||
| 843 | struct io_context; /* See blkdev.h */ | 827 | struct io_context; /* See blkdev.h */ |
| 844 | #define NGROUPS_SMALL 32 | 828 | #define NGROUPS_SMALL 32 |
| 845 | #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) | 829 | #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) |
| @@ -921,8 +905,8 @@ struct sched_class { | |||
| 921 | void (*set_cpus_allowed)(struct task_struct *p, | 905 | void (*set_cpus_allowed)(struct task_struct *p, |
| 922 | const cpumask_t *newmask); | 906 | const cpumask_t *newmask); |
| 923 | 907 | ||
| 924 | void (*join_domain)(struct rq *rq); | 908 | void (*rq_online)(struct rq *rq); |
| 925 | void (*leave_domain)(struct rq *rq); | 909 | void (*rq_offline)(struct rq *rq); |
| 926 | 910 | ||
| 927 | void (*switched_from) (struct rq *this_rq, struct task_struct *task, | 911 | void (*switched_from) (struct rq *this_rq, struct task_struct *task, |
| 928 | int running); | 912 | int running); |
| @@ -1039,6 +1023,7 @@ struct task_struct { | |||
| 1039 | #endif | 1023 | #endif |
| 1040 | 1024 | ||
| 1041 | int prio, static_prio, normal_prio; | 1025 | int prio, static_prio, normal_prio; |
| 1026 | unsigned int rt_priority; | ||
| 1042 | const struct sched_class *sched_class; | 1027 | const struct sched_class *sched_class; |
| 1043 | struct sched_entity se; | 1028 | struct sched_entity se; |
| 1044 | struct sched_rt_entity rt; | 1029 | struct sched_rt_entity rt; |
| @@ -1122,7 +1107,6 @@ struct task_struct { | |||
| 1122 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ | 1107 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ |
| 1123 | int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ | 1108 | int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ |
| 1124 | 1109 | ||
| 1125 | unsigned int rt_priority; | ||
| 1126 | cputime_t utime, stime, utimescaled, stimescaled; | 1110 | cputime_t utime, stime, utimescaled, stimescaled; |
| 1127 | cputime_t gtime; | 1111 | cputime_t gtime; |
| 1128 | cputime_t prev_utime, prev_stime; | 1112 | cputime_t prev_utime, prev_stime; |
| @@ -1141,12 +1125,12 @@ struct task_struct { | |||
| 1141 | gid_t gid,egid,sgid,fsgid; | 1125 | gid_t gid,egid,sgid,fsgid; |
| 1142 | struct group_info *group_info; | 1126 | struct group_info *group_info; |
| 1143 | kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; | 1127 | kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; |
| 1144 | unsigned securebits; | ||
| 1145 | struct user_struct *user; | 1128 | struct user_struct *user; |
| 1129 | unsigned securebits; | ||
| 1146 | #ifdef CONFIG_KEYS | 1130 | #ifdef CONFIG_KEYS |
| 1131 | unsigned char jit_keyring; /* default keyring to attach requested keys to */ | ||
| 1147 | struct key *request_key_auth; /* assumed request_key authority */ | 1132 | struct key *request_key_auth; /* assumed request_key authority */ |
| 1148 | struct key *thread_keyring; /* keyring private to this thread */ | 1133 | struct key *thread_keyring; /* keyring private to this thread */ |
| 1149 | unsigned char jit_keyring; /* default keyring to attach requested keys to */ | ||
| 1150 | #endif | 1134 | #endif |
| 1151 | char comm[TASK_COMM_LEN]; /* executable name excluding path | 1135 | char comm[TASK_COMM_LEN]; /* executable name excluding path |
| 1152 | - access with [gs]et_task_comm (which lock | 1136 | - access with [gs]et_task_comm (which lock |
| @@ -1233,8 +1217,8 @@ struct task_struct { | |||
| 1233 | # define MAX_LOCK_DEPTH 48UL | 1217 | # define MAX_LOCK_DEPTH 48UL |
| 1234 | u64 curr_chain_key; | 1218 | u64 curr_chain_key; |
| 1235 | int lockdep_depth; | 1219 | int lockdep_depth; |
| 1236 | struct held_lock held_locks[MAX_LOCK_DEPTH]; | ||
| 1237 | unsigned int lockdep_recursion; | 1220 | unsigned int lockdep_recursion; |
| 1221 | struct held_lock held_locks[MAX_LOCK_DEPTH]; | ||
| 1238 | #endif | 1222 | #endif |
| 1239 | 1223 | ||
| 1240 | /* journalling filesystem info */ | 1224 | /* journalling filesystem info */ |
| @@ -1262,10 +1246,6 @@ struct task_struct { | |||
| 1262 | u64 acct_vm_mem1; /* accumulated virtual memory usage */ | 1246 | u64 acct_vm_mem1; /* accumulated virtual memory usage */ |
| 1263 | cputime_t acct_stimexpd;/* stime since last update */ | 1247 | cputime_t acct_stimexpd;/* stime since last update */ |
| 1264 | #endif | 1248 | #endif |
| 1265 | #ifdef CONFIG_NUMA | ||
| 1266 | struct mempolicy *mempolicy; | ||
| 1267 | short il_next; | ||
| 1268 | #endif | ||
| 1269 | #ifdef CONFIG_CPUSETS | 1249 | #ifdef CONFIG_CPUSETS |
| 1270 | nodemask_t mems_allowed; | 1250 | nodemask_t mems_allowed; |
| 1271 | int cpuset_mems_generation; | 1251 | int cpuset_mems_generation; |
| @@ -1285,6 +1265,10 @@ struct task_struct { | |||
| 1285 | struct list_head pi_state_list; | 1265 | struct list_head pi_state_list; |
| 1286 | struct futex_pi_state *pi_state_cache; | 1266 | struct futex_pi_state *pi_state_cache; |
| 1287 | #endif | 1267 | #endif |
| 1268 | #ifdef CONFIG_NUMA | ||
| 1269 | struct mempolicy *mempolicy; | ||
| 1270 | short il_next; | ||
| 1271 | #endif | ||
| 1288 | atomic_t fs_excl; /* holding fs exclusive resources */ | 1272 | atomic_t fs_excl; /* holding fs exclusive resources */ |
| 1289 | struct rcu_head rcu; | 1273 | struct rcu_head rcu; |
| 1290 | 1274 | ||
| @@ -1504,6 +1488,7 @@ static inline void put_task_struct(struct task_struct *t) | |||
| 1504 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ | 1488 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ |
| 1505 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ | 1489 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ |
| 1506 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ | 1490 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ |
| 1491 | #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ | ||
| 1507 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ | 1492 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ |
| 1508 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ | 1493 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ |
| 1509 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ | 1494 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ |
| @@ -1573,13 +1558,28 @@ static inline void sched_clock_idle_sleep_event(void) | |||
| 1573 | static inline void sched_clock_idle_wakeup_event(u64 delta_ns) | 1558 | static inline void sched_clock_idle_wakeup_event(u64 delta_ns) |
| 1574 | { | 1559 | { |
| 1575 | } | 1560 | } |
| 1576 | #else | 1561 | |
| 1562 | #ifdef CONFIG_NO_HZ | ||
| 1563 | static inline void sched_clock_tick_stop(int cpu) | ||
| 1564 | { | ||
| 1565 | } | ||
| 1566 | |||
| 1567 | static inline void sched_clock_tick_start(int cpu) | ||
| 1568 | { | ||
| 1569 | } | ||
| 1570 | #endif | ||
| 1571 | |||
| 1572 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | ||
| 1577 | extern void sched_clock_init(void); | 1573 | extern void sched_clock_init(void); |
| 1578 | extern u64 sched_clock_cpu(int cpu); | 1574 | extern u64 sched_clock_cpu(int cpu); |
| 1579 | extern void sched_clock_tick(void); | 1575 | extern void sched_clock_tick(void); |
| 1580 | extern void sched_clock_idle_sleep_event(void); | 1576 | extern void sched_clock_idle_sleep_event(void); |
| 1581 | extern void sched_clock_idle_wakeup_event(u64 delta_ns); | 1577 | extern void sched_clock_idle_wakeup_event(u64 delta_ns); |
| 1578 | #ifdef CONFIG_NO_HZ | ||
| 1579 | extern void sched_clock_tick_stop(int cpu); | ||
| 1580 | extern void sched_clock_tick_start(int cpu); | ||
| 1582 | #endif | 1581 | #endif |
| 1582 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | ||
| 1583 | 1583 | ||
| 1584 | /* | 1584 | /* |
| 1585 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 1585 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
| @@ -1622,6 +1622,7 @@ extern unsigned int sysctl_sched_child_runs_first; | |||
| 1622 | extern unsigned int sysctl_sched_features; | 1622 | extern unsigned int sysctl_sched_features; |
| 1623 | extern unsigned int sysctl_sched_migration_cost; | 1623 | extern unsigned int sysctl_sched_migration_cost; |
| 1624 | extern unsigned int sysctl_sched_nr_migrate; | 1624 | extern unsigned int sysctl_sched_nr_migrate; |
| 1625 | extern unsigned int sysctl_sched_shares_ratelimit; | ||
| 1625 | 1626 | ||
| 1626 | int sched_nr_latency_handler(struct ctl_table *table, int write, | 1627 | int sched_nr_latency_handler(struct ctl_table *table, int write, |
| 1627 | struct file *file, void __user *buffer, size_t *length, | 1628 | struct file *file, void __user *buffer, size_t *length, |
diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9d..6c55301112e0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -3,7 +3,7 @@ | |||
| 3 | # | 3 | # |
| 4 | 4 | ||
| 5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | 5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ |
| 6 | exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
| 7 | sysctl.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o capability.o ptrace.o timer.o user.o \ |
| 8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
| 9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
| @@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | |||
| 27 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | 27 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o |
| 28 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | 28 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o |
| 29 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 29 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
| 30 | obj-$(CONFIG_SMP) += cpu.o spinlock.o | 30 | obj-$(CONFIG_SMP) += spinlock.o |
| 31 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | 31 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o |
| 32 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | 32 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o |
| 33 | obj-$(CONFIG_UID16) += uid16.o | 33 | obj-$(CONFIG_UID16) += uid16.o |
| @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | |||
| 69 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 69 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
| 70 | obj-$(CONFIG_MARKERS) += marker.o | 70 | obj-$(CONFIG_MARKERS) += marker.o |
| 71 | obj-$(CONFIG_LATENCYTOP) += latencytop.o | 71 | obj-$(CONFIG_LATENCYTOP) += latencytop.o |
| 72 | obj-$(CONFIG_SMP) += sched_cpupri.o | ||
| 72 | 73 | ||
| 73 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 74 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
| 74 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 75 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/cpu.c b/kernel/cpu.c index c77bc3a1c722..b11f06dc149a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -15,6 +15,28 @@ | |||
| 15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
| 16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
| 17 | 17 | ||
| 18 | /* | ||
| 19 | * Represents all cpu's present in the system | ||
| 20 | * In systems capable of hotplug, this map could dynamically grow | ||
| 21 | * as new cpu's are detected in the system via any platform specific | ||
| 22 | * method, such as ACPI for e.g. | ||
| 23 | */ | ||
| 24 | cpumask_t cpu_present_map __read_mostly; | ||
| 25 | EXPORT_SYMBOL(cpu_present_map); | ||
| 26 | |||
| 27 | #ifndef CONFIG_SMP | ||
| 28 | |||
| 29 | /* | ||
| 30 | * Represents all cpu's that are currently online. | ||
| 31 | */ | ||
| 32 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | ||
| 33 | EXPORT_SYMBOL(cpu_online_map); | ||
| 34 | |||
| 35 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | ||
| 36 | EXPORT_SYMBOL(cpu_possible_map); | ||
| 37 | |||
| 38 | #else /* CONFIG_SMP */ | ||
| 39 | |||
| 18 | /* Serializes the updates to cpu_online_map, cpu_present_map */ | 40 | /* Serializes the updates to cpu_online_map, cpu_present_map */ |
| 19 | static DEFINE_MUTEX(cpu_add_remove_lock); | 41 | static DEFINE_MUTEX(cpu_add_remove_lock); |
| 20 | 42 | ||
| @@ -403,3 +425,5 @@ out: | |||
| 403 | cpu_maps_update_done(); | 425 | cpu_maps_update_done(); |
| 404 | } | 426 | } |
| 405 | #endif /* CONFIG_PM_SLEEP_SMP */ | 427 | #endif /* CONFIG_PM_SLEEP_SMP */ |
| 428 | |||
| 429 | #endif /* CONFIG_SMP */ | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 798b3ab054eb..459d601947a8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -1194,6 +1194,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, | |||
| 1194 | 1194 | ||
| 1195 | if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1195 | if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
| 1196 | return -ENOSPC; | 1196 | return -ENOSPC; |
| 1197 | if (tsk->flags & PF_THREAD_BOUND) { | ||
| 1198 | cpumask_t mask; | ||
| 1199 | |||
| 1200 | mutex_lock(&callback_mutex); | ||
| 1201 | mask = cs->cpus_allowed; | ||
| 1202 | mutex_unlock(&callback_mutex); | ||
| 1203 | if (!cpus_equal(tsk->cpus_allowed, mask)) | ||
| 1204 | return -EINVAL; | ||
| 1205 | } | ||
| 1197 | 1206 | ||
| 1198 | return security_task_setscheduler(tsk, 0, NULL); | 1207 | return security_task_setscheduler(tsk, 0, NULL); |
| 1199 | } | 1208 | } |
| @@ -1207,11 +1216,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, | |||
| 1207 | struct mm_struct *mm; | 1216 | struct mm_struct *mm; |
| 1208 | struct cpuset *cs = cgroup_cs(cont); | 1217 | struct cpuset *cs = cgroup_cs(cont); |
| 1209 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1218 | struct cpuset *oldcs = cgroup_cs(oldcont); |
| 1219 | int err; | ||
| 1210 | 1220 | ||
| 1211 | mutex_lock(&callback_mutex); | 1221 | mutex_lock(&callback_mutex); |
| 1212 | guarantee_online_cpus(cs, &cpus); | 1222 | guarantee_online_cpus(cs, &cpus); |
| 1213 | set_cpus_allowed_ptr(tsk, &cpus); | 1223 | err = set_cpus_allowed_ptr(tsk, &cpus); |
| 1214 | mutex_unlock(&callback_mutex); | 1224 | mutex_unlock(&callback_mutex); |
| 1225 | if (err) | ||
| 1226 | return; | ||
| 1215 | 1227 | ||
| 1216 | from = oldcs->mems_allowed; | 1228 | from = oldcs->mems_allowed; |
| 1217 | to = cs->mems_allowed; | 1229 | to = cs->mems_allowed; |
diff --git a/kernel/kthread.c b/kernel/kthread.c index bd1b9ea024e1..97747cdd37c9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) | |||
| 180 | set_task_cpu(k, cpu); | 180 | set_task_cpu(k, cpu); |
| 181 | k->cpus_allowed = cpumask_of_cpu(cpu); | 181 | k->cpus_allowed = cpumask_of_cpu(cpu); |
| 182 | k->rt.nr_cpus_allowed = 1; | 182 | k->rt.nr_cpus_allowed = 1; |
| 183 | k->flags |= PF_THREAD_BOUND; | ||
| 183 | } | 184 | } |
| 184 | EXPORT_SYMBOL(kthread_bind); | 185 | EXPORT_SYMBOL(kthread_bind); |
| 185 | 186 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 8402944f715b..591d5e7f757a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -74,6 +74,8 @@ | |||
| 74 | #include <asm/tlb.h> | 74 | #include <asm/tlb.h> |
| 75 | #include <asm/irq_regs.h> | 75 | #include <asm/irq_regs.h> |
| 76 | 76 | ||
| 77 | #include "sched_cpupri.h" | ||
| 78 | |||
| 77 | /* | 79 | /* |
| 78 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 80 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
| 79 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 81 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
| @@ -289,15 +291,15 @@ struct task_group root_task_group; | |||
| 289 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 291 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
| 290 | /* Default task group's cfs_rq on each cpu */ | 292 | /* Default task group's cfs_rq on each cpu */ |
| 291 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 293 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
| 292 | #endif | 294 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 293 | 295 | ||
| 294 | #ifdef CONFIG_RT_GROUP_SCHED | 296 | #ifdef CONFIG_RT_GROUP_SCHED |
| 295 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 297 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
| 296 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 298 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
| 297 | #endif | 299 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 298 | #else | 300 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
| 299 | #define root_task_group init_task_group | 301 | #define root_task_group init_task_group |
| 300 | #endif | 302 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 301 | 303 | ||
| 302 | /* task_group_lock serializes add/remove of task groups and also changes to | 304 | /* task_group_lock serializes add/remove of task groups and also changes to |
| 303 | * a task group's cpu shares. | 305 | * a task group's cpu shares. |
| @@ -307,9 +309,9 @@ static DEFINE_SPINLOCK(task_group_lock); | |||
| 307 | #ifdef CONFIG_FAIR_GROUP_SCHED | 309 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 308 | #ifdef CONFIG_USER_SCHED | 310 | #ifdef CONFIG_USER_SCHED |
| 309 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | 311 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
| 310 | #else | 312 | #else /* !CONFIG_USER_SCHED */ |
| 311 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 313 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
| 312 | #endif | 314 | #endif /* CONFIG_USER_SCHED */ |
| 313 | 315 | ||
| 314 | /* | 316 | /* |
| 315 | * A weight of 0 or 1 can cause arithmetics problems. | 317 | * A weight of 0 or 1 can cause arithmetics problems. |
| @@ -363,6 +365,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
| 363 | #else | 365 | #else |
| 364 | 366 | ||
| 365 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 367 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
| 368 | static inline struct task_group *task_group(struct task_struct *p) | ||
| 369 | { | ||
| 370 | return NULL; | ||
| 371 | } | ||
| 366 | 372 | ||
| 367 | #endif /* CONFIG_GROUP_SCHED */ | 373 | #endif /* CONFIG_GROUP_SCHED */ |
| 368 | 374 | ||
| @@ -373,6 +379,7 @@ struct cfs_rq { | |||
| 373 | 379 | ||
| 374 | u64 exec_clock; | 380 | u64 exec_clock; |
| 375 | u64 min_vruntime; | 381 | u64 min_vruntime; |
| 382 | u64 pair_start; | ||
| 376 | 383 | ||
| 377 | struct rb_root tasks_timeline; | 384 | struct rb_root tasks_timeline; |
| 378 | struct rb_node *rb_leftmost; | 385 | struct rb_node *rb_leftmost; |
| @@ -401,6 +408,31 @@ struct cfs_rq { | |||
| 401 | */ | 408 | */ |
| 402 | struct list_head leaf_cfs_rq_list; | 409 | struct list_head leaf_cfs_rq_list; |
| 403 | struct task_group *tg; /* group that "owns" this runqueue */ | 410 | struct task_group *tg; /* group that "owns" this runqueue */ |
| 411 | |||
| 412 | #ifdef CONFIG_SMP | ||
| 413 | /* | ||
| 414 | * the part of load.weight contributed by tasks | ||
| 415 | */ | ||
| 416 | unsigned long task_weight; | ||
| 417 | |||
| 418 | /* | ||
| 419 | * h_load = weight * f(tg) | ||
| 420 | * | ||
| 421 | * Where f(tg) is the recursive weight fraction assigned to | ||
| 422 | * this group. | ||
| 423 | */ | ||
| 424 | unsigned long h_load; | ||
| 425 | |||
| 426 | /* | ||
| 427 | * this cpu's part of tg->shares | ||
| 428 | */ | ||
| 429 | unsigned long shares; | ||
| 430 | |||
| 431 | /* | ||
| 432 | * load.weight at the time we set shares | ||
| 433 | */ | ||
| 434 | unsigned long rq_weight; | ||
| 435 | #endif | ||
| 404 | #endif | 436 | #endif |
| 405 | }; | 437 | }; |
| 406 | 438 | ||
| @@ -452,6 +484,9 @@ struct root_domain { | |||
| 452 | */ | 484 | */ |
| 453 | cpumask_t rto_mask; | 485 | cpumask_t rto_mask; |
| 454 | atomic_t rto_count; | 486 | atomic_t rto_count; |
| 487 | #ifdef CONFIG_SMP | ||
| 488 | struct cpupri cpupri; | ||
| 489 | #endif | ||
| 455 | }; | 490 | }; |
| 456 | 491 | ||
| 457 | /* | 492 | /* |
| @@ -526,6 +561,9 @@ struct rq { | |||
| 526 | int push_cpu; | 561 | int push_cpu; |
| 527 | /* cpu of this runqueue: */ | 562 | /* cpu of this runqueue: */ |
| 528 | int cpu; | 563 | int cpu; |
| 564 | int online; | ||
| 565 | |||
| 566 | unsigned long avg_load_per_task; | ||
| 529 | 567 | ||
| 530 | struct task_struct *migration_thread; | 568 | struct task_struct *migration_thread; |
| 531 | struct list_head migration_queue; | 569 | struct list_head migration_queue; |
| @@ -749,6 +787,12 @@ late_initcall(sched_init_debug); | |||
| 749 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 787 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
| 750 | 788 | ||
| 751 | /* | 789 | /* |
| 790 | * ratelimit for updating the group shares. | ||
| 791 | * default: 0.5ms | ||
| 792 | */ | ||
| 793 | const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; | ||
| 794 | |||
| 795 | /* | ||
| 752 | * period over which we measure -rt task cpu usage in us. | 796 | * period over which we measure -rt task cpu usage in us. |
| 753 | * default: 1s | 797 | * default: 1s |
| 754 | */ | 798 | */ |
| @@ -775,82 +819,6 @@ static inline u64 global_rt_runtime(void) | |||
| 775 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 819 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
| 776 | } | 820 | } |
| 777 | 821 | ||
| 778 | unsigned long long time_sync_thresh = 100000; | ||
| 779 | |||
| 780 | static DEFINE_PER_CPU(unsigned long long, time_offset); | ||
| 781 | static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); | ||
| 782 | |||
| 783 | /* | ||
| 784 | * Global lock which we take every now and then to synchronize | ||
| 785 | * the CPUs time. This method is not warp-safe, but it's good | ||
| 786 | * enough to synchronize slowly diverging time sources and thus | ||
| 787 | * it's good enough for tracing: | ||
| 788 | */ | ||
| 789 | static DEFINE_SPINLOCK(time_sync_lock); | ||
| 790 | static unsigned long long prev_global_time; | ||
| 791 | |||
| 792 | static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) | ||
| 793 | { | ||
| 794 | /* | ||
| 795 | * We want this inlined, to not get tracer function calls | ||
| 796 | * in this critical section: | ||
| 797 | */ | ||
| 798 | spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); | ||
| 799 | __raw_spin_lock(&time_sync_lock.raw_lock); | ||
| 800 | |||
| 801 | if (time < prev_global_time) { | ||
| 802 | per_cpu(time_offset, cpu) += prev_global_time - time; | ||
| 803 | time = prev_global_time; | ||
| 804 | } else { | ||
| 805 | prev_global_time = time; | ||
| 806 | } | ||
| 807 | |||
| 808 | __raw_spin_unlock(&time_sync_lock.raw_lock); | ||
| 809 | spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); | ||
| 810 | |||
| 811 | return time; | ||
| 812 | } | ||
| 813 | |||
| 814 | static unsigned long long __cpu_clock(int cpu) | ||
| 815 | { | ||
| 816 | unsigned long long now; | ||
| 817 | |||
| 818 | /* | ||
| 819 | * Only call sched_clock() if the scheduler has already been | ||
| 820 | * initialized (some code might call cpu_clock() very early): | ||
| 821 | */ | ||
| 822 | if (unlikely(!scheduler_running)) | ||
| 823 | return 0; | ||
| 824 | |||
| 825 | now = sched_clock_cpu(cpu); | ||
| 826 | |||
| 827 | return now; | ||
| 828 | } | ||
| 829 | |||
| 830 | /* | ||
| 831 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
| 832 | * clock constructed from sched_clock(): | ||
| 833 | */ | ||
| 834 | unsigned long long cpu_clock(int cpu) | ||
| 835 | { | ||
| 836 | unsigned long long prev_cpu_time, time, delta_time; | ||
| 837 | unsigned long flags; | ||
| 838 | |||
| 839 | local_irq_save(flags); | ||
| 840 | prev_cpu_time = per_cpu(prev_cpu_time, cpu); | ||
| 841 | time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); | ||
| 842 | delta_time = time-prev_cpu_time; | ||
| 843 | |||
| 844 | if (unlikely(delta_time > time_sync_thresh)) { | ||
| 845 | time = __sync_cpu_clock(time, cpu); | ||
| 846 | per_cpu(prev_cpu_time, cpu) = time; | ||
| 847 | } | ||
| 848 | local_irq_restore(flags); | ||
| 849 | |||
| 850 | return time; | ||
| 851 | } | ||
| 852 | EXPORT_SYMBOL_GPL(cpu_clock); | ||
| 853 | |||
| 854 | #ifndef prepare_arch_switch | 822 | #ifndef prepare_arch_switch |
| 855 | # define prepare_arch_switch(next) do { } while (0) | 823 | # define prepare_arch_switch(next) do { } while (0) |
| 856 | #endif | 824 | #endif |
| @@ -1313,15 +1281,15 @@ void wake_up_idle_cpu(int cpu) | |||
| 1313 | if (!tsk_is_polling(rq->idle)) | 1281 | if (!tsk_is_polling(rq->idle)) |
| 1314 | smp_send_reschedule(cpu); | 1282 | smp_send_reschedule(cpu); |
| 1315 | } | 1283 | } |
| 1316 | #endif | 1284 | #endif /* CONFIG_NO_HZ */ |
| 1317 | 1285 | ||
| 1318 | #else | 1286 | #else /* !CONFIG_SMP */ |
| 1319 | static void __resched_task(struct task_struct *p, int tif_bit) | 1287 | static void __resched_task(struct task_struct *p, int tif_bit) |
| 1320 | { | 1288 | { |
| 1321 | assert_spin_locked(&task_rq(p)->lock); | 1289 | assert_spin_locked(&task_rq(p)->lock); |
| 1322 | set_tsk_thread_flag(p, tif_bit); | 1290 | set_tsk_thread_flag(p, tif_bit); |
| 1323 | } | 1291 | } |
| 1324 | #endif | 1292 | #endif /* CONFIG_SMP */ |
| 1325 | 1293 | ||
| 1326 | #if BITS_PER_LONG == 32 | 1294 | #if BITS_PER_LONG == 32 |
| 1327 | # define WMULT_CONST (~0UL) | 1295 | # define WMULT_CONST (~0UL) |
| @@ -1336,6 +1304,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) | |||
| 1336 | */ | 1304 | */ |
| 1337 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 1305 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) |
| 1338 | 1306 | ||
| 1307 | /* | ||
| 1308 | * delta *= weight / lw | ||
| 1309 | */ | ||
| 1339 | static unsigned long | 1310 | static unsigned long |
| 1340 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 1311 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
| 1341 | struct load_weight *lw) | 1312 | struct load_weight *lw) |
| @@ -1363,12 +1334,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
| 1363 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 1334 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
| 1364 | } | 1335 | } |
| 1365 | 1336 | ||
| 1366 | static inline unsigned long | ||
| 1367 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | ||
| 1368 | { | ||
| 1369 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | ||
| 1370 | } | ||
| 1371 | |||
| 1372 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1337 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
| 1373 | { | 1338 | { |
| 1374 | lw->weight += inc; | 1339 | lw->weight += inc; |
| @@ -1479,17 +1444,211 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
| 1479 | #ifdef CONFIG_SMP | 1444 | #ifdef CONFIG_SMP |
| 1480 | static unsigned long source_load(int cpu, int type); | 1445 | static unsigned long source_load(int cpu, int type); |
| 1481 | static unsigned long target_load(int cpu, int type); | 1446 | static unsigned long target_load(int cpu, int type); |
| 1482 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
| 1483 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1447 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
| 1484 | #else /* CONFIG_SMP */ | 1448 | |
| 1449 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
| 1450 | { | ||
| 1451 | struct rq *rq = cpu_rq(cpu); | ||
| 1452 | |||
| 1453 | if (rq->nr_running) | ||
| 1454 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
| 1455 | |||
| 1456 | return rq->avg_load_per_task; | ||
| 1457 | } | ||
| 1485 | 1458 | ||
| 1486 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1459 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1487 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | 1460 | |
| 1461 | typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); | ||
| 1462 | |||
| 1463 | /* | ||
| 1464 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
| 1465 | * leaving it for the final time. | ||
| 1466 | */ | ||
| 1467 | static void | ||
| 1468 | walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) | ||
| 1469 | { | ||
| 1470 | struct task_group *parent, *child; | ||
| 1471 | |||
| 1472 | rcu_read_lock(); | ||
| 1473 | parent = &root_task_group; | ||
| 1474 | down: | ||
| 1475 | (*down)(parent, cpu, sd); | ||
| 1476 | list_for_each_entry_rcu(child, &parent->children, siblings) { | ||
| 1477 | parent = child; | ||
| 1478 | goto down; | ||
| 1479 | |||
| 1480 | up: | ||
| 1481 | continue; | ||
| 1482 | } | ||
| 1483 | (*up)(parent, cpu, sd); | ||
| 1484 | |||
| 1485 | child = parent; | ||
| 1486 | parent = parent->parent; | ||
| 1487 | if (parent) | ||
| 1488 | goto up; | ||
| 1489 | rcu_read_unlock(); | ||
| 1490 | } | ||
| 1491 | |||
| 1492 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
| 1493 | |||
| 1494 | /* | ||
| 1495 | * Calculate and set the cpu's group shares. | ||
| 1496 | */ | ||
| 1497 | static void | ||
| 1498 | __update_group_shares_cpu(struct task_group *tg, int cpu, | ||
| 1499 | unsigned long sd_shares, unsigned long sd_rq_weight) | ||
| 1488 | { | 1500 | { |
| 1501 | int boost = 0; | ||
| 1502 | unsigned long shares; | ||
| 1503 | unsigned long rq_weight; | ||
| 1504 | |||
| 1505 | if (!tg->se[cpu]) | ||
| 1506 | return; | ||
| 1507 | |||
| 1508 | rq_weight = tg->cfs_rq[cpu]->load.weight; | ||
| 1509 | |||
| 1510 | /* | ||
| 1511 | * If there are currently no tasks on the cpu pretend there is one of | ||
| 1512 | * average load so that when a new task gets to run here it will not | ||
| 1513 | * get delayed by group starvation. | ||
| 1514 | */ | ||
| 1515 | if (!rq_weight) { | ||
| 1516 | boost = 1; | ||
| 1517 | rq_weight = NICE_0_LOAD; | ||
| 1518 | } | ||
| 1519 | |||
| 1520 | if (unlikely(rq_weight > sd_rq_weight)) | ||
| 1521 | rq_weight = sd_rq_weight; | ||
| 1522 | |||
| 1523 | /* | ||
| 1524 | * \Sum shares * rq_weight | ||
| 1525 | * shares = ----------------------- | ||
| 1526 | * \Sum rq_weight | ||
| 1527 | * | ||
| 1528 | */ | ||
| 1529 | shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); | ||
| 1530 | |||
| 1531 | /* | ||
| 1532 | * record the actual number of shares, not the boosted amount. | ||
| 1533 | */ | ||
| 1534 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
| 1535 | tg->cfs_rq[cpu]->rq_weight = rq_weight; | ||
| 1536 | |||
| 1537 | if (shares < MIN_SHARES) | ||
| 1538 | shares = MIN_SHARES; | ||
| 1539 | else if (shares > MAX_SHARES) | ||
| 1540 | shares = MAX_SHARES; | ||
| 1541 | |||
| 1542 | __set_se_shares(tg->se[cpu], shares); | ||
| 1489 | } | 1543 | } |
| 1544 | |||
| 1545 | /* | ||
| 1546 | * Re-compute the task group their per cpu shares over the given domain. | ||
| 1547 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
| 1548 | * parent group depends on the shares of its child groups. | ||
| 1549 | */ | ||
| 1550 | static void | ||
| 1551 | tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
| 1552 | { | ||
| 1553 | unsigned long rq_weight = 0; | ||
| 1554 | unsigned long shares = 0; | ||
| 1555 | int i; | ||
| 1556 | |||
| 1557 | for_each_cpu_mask(i, sd->span) { | ||
| 1558 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
| 1559 | shares += tg->cfs_rq[i]->shares; | ||
| 1560 | } | ||
| 1561 | |||
| 1562 | if ((!shares && rq_weight) || shares > tg->shares) | ||
| 1563 | shares = tg->shares; | ||
| 1564 | |||
| 1565 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
| 1566 | shares = tg->shares; | ||
| 1567 | |||
| 1568 | if (!rq_weight) | ||
| 1569 | rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; | ||
| 1570 | |||
| 1571 | for_each_cpu_mask(i, sd->span) { | ||
| 1572 | struct rq *rq = cpu_rq(i); | ||
| 1573 | unsigned long flags; | ||
| 1574 | |||
| 1575 | spin_lock_irqsave(&rq->lock, flags); | ||
| 1576 | __update_group_shares_cpu(tg, i, shares, rq_weight); | ||
| 1577 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 1578 | } | ||
| 1579 | } | ||
| 1580 | |||
| 1581 | /* | ||
| 1582 | * Compute the cpu's hierarchical load factor for each task group. | ||
| 1583 | * This needs to be done in a top-down fashion because the load of a child | ||
| 1584 | * group is a fraction of its parents load. | ||
| 1585 | */ | ||
| 1586 | static void | ||
| 1587 | tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
| 1588 | { | ||
| 1589 | unsigned long load; | ||
| 1590 | |||
| 1591 | if (!tg->parent) { | ||
| 1592 | load = cpu_rq(cpu)->load.weight; | ||
| 1593 | } else { | ||
| 1594 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
| 1595 | load *= tg->cfs_rq[cpu]->shares; | ||
| 1596 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | ||
| 1597 | } | ||
| 1598 | |||
| 1599 | tg->cfs_rq[cpu]->h_load = load; | ||
| 1600 | } | ||
| 1601 | |||
| 1602 | static void | ||
| 1603 | tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
| 1604 | { | ||
| 1605 | } | ||
| 1606 | |||
| 1607 | static void update_shares(struct sched_domain *sd) | ||
| 1608 | { | ||
| 1609 | u64 now = cpu_clock(raw_smp_processor_id()); | ||
| 1610 | s64 elapsed = now - sd->last_update; | ||
| 1611 | |||
| 1612 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
| 1613 | sd->last_update = now; | ||
| 1614 | walk_tg_tree(tg_nop, tg_shares_up, 0, sd); | ||
| 1615 | } | ||
| 1616 | } | ||
| 1617 | |||
| 1618 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
| 1619 | { | ||
| 1620 | spin_unlock(&rq->lock); | ||
| 1621 | update_shares(sd); | ||
| 1622 | spin_lock(&rq->lock); | ||
| 1623 | } | ||
| 1624 | |||
| 1625 | static void update_h_load(int cpu) | ||
| 1626 | { | ||
| 1627 | walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); | ||
| 1628 | } | ||
| 1629 | |||
| 1630 | #else | ||
| 1631 | |||
| 1632 | static inline void update_shares(struct sched_domain *sd) | ||
| 1633 | { | ||
| 1634 | } | ||
| 1635 | |||
| 1636 | static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
| 1637 | { | ||
| 1638 | } | ||
| 1639 | |||
| 1490 | #endif | 1640 | #endif |
| 1491 | 1641 | ||
| 1492 | #endif /* CONFIG_SMP */ | 1642 | #endif |
| 1643 | |||
| 1644 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1645 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
| 1646 | { | ||
| 1647 | #ifdef CONFIG_SMP | ||
| 1648 | cfs_rq->shares = shares; | ||
| 1649 | #endif | ||
| 1650 | } | ||
| 1651 | #endif | ||
| 1493 | 1652 | ||
| 1494 | #include "sched_stats.h" | 1653 | #include "sched_stats.h" |
| 1495 | #include "sched_idletask.c" | 1654 | #include "sched_idletask.c" |
| @@ -1500,27 +1659,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
| 1500 | #endif | 1659 | #endif |
| 1501 | 1660 | ||
| 1502 | #define sched_class_highest (&rt_sched_class) | 1661 | #define sched_class_highest (&rt_sched_class) |
| 1662 | #define for_each_class(class) \ | ||
| 1663 | for (class = sched_class_highest; class; class = class->next) | ||
| 1503 | 1664 | ||
| 1504 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 1665 | static void inc_nr_running(struct rq *rq) |
| 1505 | { | ||
| 1506 | update_load_add(&rq->load, p->se.load.weight); | ||
| 1507 | } | ||
| 1508 | |||
| 1509 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
| 1510 | { | ||
| 1511 | update_load_sub(&rq->load, p->se.load.weight); | ||
| 1512 | } | ||
| 1513 | |||
| 1514 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
| 1515 | { | 1666 | { |
| 1516 | rq->nr_running++; | 1667 | rq->nr_running++; |
| 1517 | inc_load(rq, p); | ||
| 1518 | } | 1668 | } |
| 1519 | 1669 | ||
| 1520 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1670 | static void dec_nr_running(struct rq *rq) |
| 1521 | { | 1671 | { |
| 1522 | rq->nr_running--; | 1672 | rq->nr_running--; |
| 1523 | dec_load(rq, p); | ||
| 1524 | } | 1673 | } |
| 1525 | 1674 | ||
| 1526 | static void set_load_weight(struct task_struct *p) | 1675 | static void set_load_weight(struct task_struct *p) |
| @@ -1544,6 +1693,12 @@ static void set_load_weight(struct task_struct *p) | |||
| 1544 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1693 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; |
| 1545 | } | 1694 | } |
| 1546 | 1695 | ||
| 1696 | static void update_avg(u64 *avg, u64 sample) | ||
| 1697 | { | ||
| 1698 | s64 diff = sample - *avg; | ||
| 1699 | *avg += diff >> 3; | ||
| 1700 | } | ||
| 1701 | |||
| 1547 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | 1702 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) |
| 1548 | { | 1703 | { |
| 1549 | sched_info_queued(p); | 1704 | sched_info_queued(p); |
| @@ -1553,6 +1708,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 1553 | 1708 | ||
| 1554 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) | 1709 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) |
| 1555 | { | 1710 | { |
| 1711 | if (sleep && p->se.last_wakeup) { | ||
| 1712 | update_avg(&p->se.avg_overlap, | ||
| 1713 | p->se.sum_exec_runtime - p->se.last_wakeup); | ||
| 1714 | p->se.last_wakeup = 0; | ||
| 1715 | } | ||
| 1716 | |||
| 1717 | sched_info_dequeued(p); | ||
| 1556 | p->sched_class->dequeue_task(rq, p, sleep); | 1718 | p->sched_class->dequeue_task(rq, p, sleep); |
| 1557 | p->se.on_rq = 0; | 1719 | p->se.on_rq = 0; |
| 1558 | } | 1720 | } |
| @@ -1612,7 +1774,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 1612 | rq->nr_uninterruptible--; | 1774 | rq->nr_uninterruptible--; |
| 1613 | 1775 | ||
| 1614 | enqueue_task(rq, p, wakeup); | 1776 | enqueue_task(rq, p, wakeup); |
| 1615 | inc_nr_running(p, rq); | 1777 | inc_nr_running(rq); |
| 1616 | } | 1778 | } |
| 1617 | 1779 | ||
| 1618 | /* | 1780 | /* |
| @@ -1624,7 +1786,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | |||
| 1624 | rq->nr_uninterruptible++; | 1786 | rq->nr_uninterruptible++; |
| 1625 | 1787 | ||
| 1626 | dequeue_task(rq, p, sleep); | 1788 | dequeue_task(rq, p, sleep); |
| 1627 | dec_nr_running(p, rq); | 1789 | dec_nr_running(rq); |
| 1628 | } | 1790 | } |
| 1629 | 1791 | ||
| 1630 | /** | 1792 | /** |
| @@ -1636,12 +1798,6 @@ inline int task_curr(const struct task_struct *p) | |||
| 1636 | return cpu_curr(task_cpu(p)) == p; | 1798 | return cpu_curr(task_cpu(p)) == p; |
| 1637 | } | 1799 | } |
| 1638 | 1800 | ||
| 1639 | /* Used instead of source_load when we know the type == 0 */ | ||
| 1640 | unsigned long weighted_cpuload(const int cpu) | ||
| 1641 | { | ||
| 1642 | return cpu_rq(cpu)->load.weight; | ||
| 1643 | } | ||
| 1644 | |||
| 1645 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1801 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
| 1646 | { | 1802 | { |
| 1647 | set_task_rq(p, cpu); | 1803 | set_task_rq(p, cpu); |
| @@ -1670,6 +1826,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
| 1670 | 1826 | ||
| 1671 | #ifdef CONFIG_SMP | 1827 | #ifdef CONFIG_SMP |
| 1672 | 1828 | ||
| 1829 | /* Used instead of source_load when we know the type == 0 */ | ||
| 1830 | static unsigned long weighted_cpuload(const int cpu) | ||
| 1831 | { | ||
| 1832 | return cpu_rq(cpu)->load.weight; | ||
| 1833 | } | ||
| 1834 | |||
| 1673 | /* | 1835 | /* |
| 1674 | * Is this task likely cache-hot: | 1836 | * Is this task likely cache-hot: |
| 1675 | */ | 1837 | */ |
| @@ -1880,7 +2042,7 @@ static unsigned long source_load(int cpu, int type) | |||
| 1880 | struct rq *rq = cpu_rq(cpu); | 2042 | struct rq *rq = cpu_rq(cpu); |
| 1881 | unsigned long total = weighted_cpuload(cpu); | 2043 | unsigned long total = weighted_cpuload(cpu); |
| 1882 | 2044 | ||
| 1883 | if (type == 0) | 2045 | if (type == 0 || !sched_feat(LB_BIAS)) |
| 1884 | return total; | 2046 | return total; |
| 1885 | 2047 | ||
| 1886 | return min(rq->cpu_load[type-1], total); | 2048 | return min(rq->cpu_load[type-1], total); |
| @@ -1895,25 +2057,13 @@ static unsigned long target_load(int cpu, int type) | |||
| 1895 | struct rq *rq = cpu_rq(cpu); | 2057 | struct rq *rq = cpu_rq(cpu); |
| 1896 | unsigned long total = weighted_cpuload(cpu); | 2058 | unsigned long total = weighted_cpuload(cpu); |
| 1897 | 2059 | ||
| 1898 | if (type == 0) | 2060 | if (type == 0 || !sched_feat(LB_BIAS)) |
| 1899 | return total; | 2061 | return total; |
| 1900 | 2062 | ||
| 1901 | return max(rq->cpu_load[type-1], total); | 2063 | return max(rq->cpu_load[type-1], total); |
| 1902 | } | 2064 | } |
| 1903 | 2065 | ||
| 1904 | /* | 2066 | /* |
| 1905 | * Return the average load per task on the cpu's run queue | ||
| 1906 | */ | ||
| 1907 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
| 1908 | { | ||
| 1909 | struct rq *rq = cpu_rq(cpu); | ||
| 1910 | unsigned long total = weighted_cpuload(cpu); | ||
| 1911 | unsigned long n = rq->nr_running; | ||
| 1912 | |||
| 1913 | return n ? total / n : SCHED_LOAD_SCALE; | ||
| 1914 | } | ||
| 1915 | |||
| 1916 | /* | ||
| 1917 | * find_idlest_group finds and returns the least busy CPU group within the | 2067 | * find_idlest_group finds and returns the least busy CPU group within the |
| 1918 | * domain. | 2068 | * domain. |
| 1919 | */ | 2069 | */ |
| @@ -2019,6 +2169,9 @@ static int sched_balance_self(int cpu, int flag) | |||
| 2019 | sd = tmp; | 2169 | sd = tmp; |
| 2020 | } | 2170 | } |
| 2021 | 2171 | ||
| 2172 | if (sd) | ||
| 2173 | update_shares(sd); | ||
| 2174 | |||
| 2022 | while (sd) { | 2175 | while (sd) { |
| 2023 | cpumask_t span, tmpmask; | 2176 | cpumask_t span, tmpmask; |
| 2024 | struct sched_group *group; | 2177 | struct sched_group *group; |
| @@ -2085,6 +2238,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 2085 | if (!sched_feat(SYNC_WAKEUPS)) | 2238 | if (!sched_feat(SYNC_WAKEUPS)) |
| 2086 | sync = 0; | 2239 | sync = 0; |
| 2087 | 2240 | ||
| 2241 | #ifdef CONFIG_SMP | ||
| 2242 | if (sched_feat(LB_WAKEUP_UPDATE)) { | ||
| 2243 | struct sched_domain *sd; | ||
| 2244 | |||
| 2245 | this_cpu = raw_smp_processor_id(); | ||
| 2246 | cpu = task_cpu(p); | ||
| 2247 | |||
| 2248 | for_each_domain(this_cpu, sd) { | ||
| 2249 | if (cpu_isset(cpu, sd->span)) { | ||
| 2250 | update_shares(sd); | ||
| 2251 | break; | ||
| 2252 | } | ||
| 2253 | } | ||
| 2254 | } | ||
| 2255 | #endif | ||
| 2256 | |||
| 2088 | smp_wmb(); | 2257 | smp_wmb(); |
| 2089 | rq = task_rq_lock(p, &flags); | 2258 | rq = task_rq_lock(p, &flags); |
| 2090 | old_state = p->state; | 2259 | old_state = p->state; |
| @@ -2131,7 +2300,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 2131 | } | 2300 | } |
| 2132 | } | 2301 | } |
| 2133 | } | 2302 | } |
| 2134 | #endif | 2303 | #endif /* CONFIG_SCHEDSTATS */ |
| 2135 | 2304 | ||
| 2136 | out_activate: | 2305 | out_activate: |
| 2137 | #endif /* CONFIG_SMP */ | 2306 | #endif /* CONFIG_SMP */ |
| @@ -2157,6 +2326,8 @@ out_running: | |||
| 2157 | p->sched_class->task_wake_up(rq, p); | 2326 | p->sched_class->task_wake_up(rq, p); |
| 2158 | #endif | 2327 | #endif |
| 2159 | out: | 2328 | out: |
| 2329 | current->se.last_wakeup = current->se.sum_exec_runtime; | ||
| 2330 | |||
| 2160 | task_rq_unlock(rq, &flags); | 2331 | task_rq_unlock(rq, &flags); |
| 2161 | 2332 | ||
| 2162 | return success; | 2333 | return success; |
| @@ -2277,7 +2448,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 2277 | * management (if any): | 2448 | * management (if any): |
| 2278 | */ | 2449 | */ |
| 2279 | p->sched_class->task_new(rq, p); | 2450 | p->sched_class->task_new(rq, p); |
| 2280 | inc_nr_running(p, rq); | 2451 | inc_nr_running(rq); |
| 2281 | } | 2452 | } |
| 2282 | check_preempt_curr(rq, p); | 2453 | check_preempt_curr(rq, p); |
| 2283 | #ifdef CONFIG_SMP | 2454 | #ifdef CONFIG_SMP |
| @@ -2331,7 +2502,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
| 2331 | notifier->ops->sched_out(notifier, next); | 2502 | notifier->ops->sched_out(notifier, next); |
| 2332 | } | 2503 | } |
| 2333 | 2504 | ||
| 2334 | #else | 2505 | #else /* !CONFIG_PREEMPT_NOTIFIERS */ |
| 2335 | 2506 | ||
| 2336 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2507 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
| 2337 | { | 2508 | { |
| @@ -2343,7 +2514,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
| 2343 | { | 2514 | { |
| 2344 | } | 2515 | } |
| 2345 | 2516 | ||
| 2346 | #endif | 2517 | #endif /* CONFIG_PREEMPT_NOTIFIERS */ |
| 2347 | 2518 | ||
| 2348 | /** | 2519 | /** |
| 2349 | * prepare_task_switch - prepare to switch tasks | 2520 | * prepare_task_switch - prepare to switch tasks |
| @@ -2785,7 +2956,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2785 | enum cpu_idle_type idle, int *all_pinned, | 2956 | enum cpu_idle_type idle, int *all_pinned, |
| 2786 | int *this_best_prio, struct rq_iterator *iterator) | 2957 | int *this_best_prio, struct rq_iterator *iterator) |
| 2787 | { | 2958 | { |
| 2788 | int loops = 0, pulled = 0, pinned = 0, skip_for_load; | 2959 | int loops = 0, pulled = 0, pinned = 0; |
| 2789 | struct task_struct *p; | 2960 | struct task_struct *p; |
| 2790 | long rem_load_move = max_load_move; | 2961 | long rem_load_move = max_load_move; |
| 2791 | 2962 | ||
| @@ -2801,14 +2972,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2801 | next: | 2972 | next: |
| 2802 | if (!p || loops++ > sysctl_sched_nr_migrate) | 2973 | if (!p || loops++ > sysctl_sched_nr_migrate) |
| 2803 | goto out; | 2974 | goto out; |
| 2804 | /* | 2975 | |
| 2805 | * To help distribute high priority tasks across CPUs we don't | 2976 | if ((p->se.load.weight >> 1) > rem_load_move || |
| 2806 | * skip a task if it will be the highest priority task (i.e. smallest | ||
| 2807 | * prio value) on its new queue regardless of its load weight | ||
| 2808 | */ | ||
| 2809 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + | ||
| 2810 | SCHED_LOAD_SCALE_FUZZ; | ||
| 2811 | if ((skip_for_load && p->prio >= *this_best_prio) || | ||
| 2812 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | 2977 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
| 2813 | p = iterator->next(iterator->arg); | 2978 | p = iterator->next(iterator->arg); |
| 2814 | goto next; | 2979 | goto next; |
| @@ -2863,6 +3028,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2863 | max_load_move - total_load_moved, | 3028 | max_load_move - total_load_moved, |
| 2864 | sd, idle, all_pinned, &this_best_prio); | 3029 | sd, idle, all_pinned, &this_best_prio); |
| 2865 | class = class->next; | 3030 | class = class->next; |
| 3031 | |||
| 3032 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | ||
| 3033 | break; | ||
| 3034 | |||
| 2866 | } while (class && max_load_move > total_load_moved); | 3035 | } while (class && max_load_move > total_load_moved); |
| 2867 | 3036 | ||
| 2868 | return total_load_moved > 0; | 3037 | return total_load_moved > 0; |
| @@ -2939,6 +3108,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2939 | max_load = this_load = total_load = total_pwr = 0; | 3108 | max_load = this_load = total_load = total_pwr = 0; |
| 2940 | busiest_load_per_task = busiest_nr_running = 0; | 3109 | busiest_load_per_task = busiest_nr_running = 0; |
| 2941 | this_load_per_task = this_nr_running = 0; | 3110 | this_load_per_task = this_nr_running = 0; |
| 3111 | |||
| 2942 | if (idle == CPU_NOT_IDLE) | 3112 | if (idle == CPU_NOT_IDLE) |
| 2943 | load_idx = sd->busy_idx; | 3113 | load_idx = sd->busy_idx; |
| 2944 | else if (idle == CPU_NEWLY_IDLE) | 3114 | else if (idle == CPU_NEWLY_IDLE) |
| @@ -2953,6 +3123,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2953 | int __group_imb = 0; | 3123 | int __group_imb = 0; |
| 2954 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3124 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
| 2955 | unsigned long sum_nr_running, sum_weighted_load; | 3125 | unsigned long sum_nr_running, sum_weighted_load; |
| 3126 | unsigned long sum_avg_load_per_task; | ||
| 3127 | unsigned long avg_load_per_task; | ||
| 2956 | 3128 | ||
| 2957 | local_group = cpu_isset(this_cpu, group->cpumask); | 3129 | local_group = cpu_isset(this_cpu, group->cpumask); |
| 2958 | 3130 | ||
| @@ -2961,6 +3133,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2961 | 3133 | ||
| 2962 | /* Tally up the load of all CPUs in the group */ | 3134 | /* Tally up the load of all CPUs in the group */ |
| 2963 | sum_weighted_load = sum_nr_running = avg_load = 0; | 3135 | sum_weighted_load = sum_nr_running = avg_load = 0; |
| 3136 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
| 3137 | |||
| 2964 | max_cpu_load = 0; | 3138 | max_cpu_load = 0; |
| 2965 | min_cpu_load = ~0UL; | 3139 | min_cpu_load = ~0UL; |
| 2966 | 3140 | ||
| @@ -2994,6 +3168,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2994 | avg_load += load; | 3168 | avg_load += load; |
| 2995 | sum_nr_running += rq->nr_running; | 3169 | sum_nr_running += rq->nr_running; |
| 2996 | sum_weighted_load += weighted_cpuload(i); | 3170 | sum_weighted_load += weighted_cpuload(i); |
| 3171 | |||
| 3172 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | ||
| 2997 | } | 3173 | } |
| 2998 | 3174 | ||
| 2999 | /* | 3175 | /* |
| @@ -3015,7 +3191,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 3015 | avg_load = sg_div_cpu_power(group, | 3191 | avg_load = sg_div_cpu_power(group, |
| 3016 | avg_load * SCHED_LOAD_SCALE); | 3192 | avg_load * SCHED_LOAD_SCALE); |
| 3017 | 3193 | ||
| 3018 | if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) | 3194 | |
| 3195 | /* | ||
| 3196 | * Consider the group unbalanced when the imbalance is larger | ||
| 3197 | * than the average weight of two tasks. | ||
| 3198 | * | ||
| 3199 | * APZ: with cgroup the avg task weight can vary wildly and | ||
| 3200 | * might not be a suitable number - should we keep a | ||
| 3201 | * normalized nr_running number somewhere that negates | ||
| 3202 | * the hierarchy? | ||
| 3203 | */ | ||
| 3204 | avg_load_per_task = sg_div_cpu_power(group, | ||
| 3205 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | ||
| 3206 | |||
| 3207 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
| 3019 | __group_imb = 1; | 3208 | __group_imb = 1; |
| 3020 | 3209 | ||
| 3021 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3210 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; |
| @@ -3156,9 +3345,9 @@ small_imbalance: | |||
| 3156 | if (busiest_load_per_task > this_load_per_task) | 3345 | if (busiest_load_per_task > this_load_per_task) |
| 3157 | imbn = 1; | 3346 | imbn = 1; |
| 3158 | } else | 3347 | } else |
| 3159 | this_load_per_task = SCHED_LOAD_SCALE; | 3348 | this_load_per_task = cpu_avg_load_per_task(this_cpu); |
| 3160 | 3349 | ||
| 3161 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= | 3350 | if (max_load - this_load + 2*busiest_load_per_task >= |
| 3162 | busiest_load_per_task * imbn) { | 3351 | busiest_load_per_task * imbn) { |
| 3163 | *imbalance = busiest_load_per_task; | 3352 | *imbalance = busiest_load_per_task; |
| 3164 | return busiest; | 3353 | return busiest; |
| @@ -3284,6 +3473,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 3284 | schedstat_inc(sd, lb_count[idle]); | 3473 | schedstat_inc(sd, lb_count[idle]); |
| 3285 | 3474 | ||
| 3286 | redo: | 3475 | redo: |
| 3476 | update_shares(sd); | ||
| 3287 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3477 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
| 3288 | cpus, balance); | 3478 | cpus, balance); |
| 3289 | 3479 | ||
| @@ -3386,8 +3576,9 @@ redo: | |||
| 3386 | 3576 | ||
| 3387 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3577 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 3388 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3578 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
| 3389 | return -1; | 3579 | ld_moved = -1; |
| 3390 | return ld_moved; | 3580 | |
| 3581 | goto out; | ||
| 3391 | 3582 | ||
| 3392 | out_balanced: | 3583 | out_balanced: |
| 3393 | schedstat_inc(sd, lb_balanced[idle]); | 3584 | schedstat_inc(sd, lb_balanced[idle]); |
| @@ -3402,8 +3593,13 @@ out_one_pinned: | |||
| 3402 | 3593 | ||
| 3403 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3594 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 3404 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3595 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
| 3405 | return -1; | 3596 | ld_moved = -1; |
| 3406 | return 0; | 3597 | else |
| 3598 | ld_moved = 0; | ||
| 3599 | out: | ||
| 3600 | if (ld_moved) | ||
| 3601 | update_shares(sd); | ||
| 3602 | return ld_moved; | ||
| 3407 | } | 3603 | } |
| 3408 | 3604 | ||
| 3409 | /* | 3605 | /* |
| @@ -3438,6 +3634,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, | |||
| 3438 | 3634 | ||
| 3439 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | 3635 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
| 3440 | redo: | 3636 | redo: |
| 3637 | update_shares_locked(this_rq, sd); | ||
| 3441 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 3638 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
| 3442 | &sd_idle, cpus, NULL); | 3639 | &sd_idle, cpus, NULL); |
| 3443 | if (!group) { | 3640 | if (!group) { |
| @@ -3481,6 +3678,7 @@ redo: | |||
| 3481 | } else | 3678 | } else |
| 3482 | sd->nr_balance_failed = 0; | 3679 | sd->nr_balance_failed = 0; |
| 3483 | 3680 | ||
| 3681 | update_shares_locked(this_rq, sd); | ||
| 3484 | return ld_moved; | 3682 | return ld_moved; |
| 3485 | 3683 | ||
| 3486 | out_balanced: | 3684 | out_balanced: |
| @@ -3672,6 +3870,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3672 | /* Earliest time when we have to do rebalance again */ | 3870 | /* Earliest time when we have to do rebalance again */ |
| 3673 | unsigned long next_balance = jiffies + 60*HZ; | 3871 | unsigned long next_balance = jiffies + 60*HZ; |
| 3674 | int update_next_balance = 0; | 3872 | int update_next_balance = 0; |
| 3873 | int need_serialize; | ||
| 3675 | cpumask_t tmp; | 3874 | cpumask_t tmp; |
| 3676 | 3875 | ||
| 3677 | for_each_domain(cpu, sd) { | 3876 | for_each_domain(cpu, sd) { |
| @@ -3689,8 +3888,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3689 | if (interval > HZ*NR_CPUS/10) | 3888 | if (interval > HZ*NR_CPUS/10) |
| 3690 | interval = HZ*NR_CPUS/10; | 3889 | interval = HZ*NR_CPUS/10; |
| 3691 | 3890 | ||
| 3891 | need_serialize = sd->flags & SD_SERIALIZE; | ||
| 3692 | 3892 | ||
| 3693 | if (sd->flags & SD_SERIALIZE) { | 3893 | if (need_serialize) { |
| 3694 | if (!spin_trylock(&balancing)) | 3894 | if (!spin_trylock(&balancing)) |
| 3695 | goto out; | 3895 | goto out; |
| 3696 | } | 3896 | } |
| @@ -3706,7 +3906,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3706 | } | 3906 | } |
| 3707 | sd->last_balance = jiffies; | 3907 | sd->last_balance = jiffies; |
| 3708 | } | 3908 | } |
| 3709 | if (sd->flags & SD_SERIALIZE) | 3909 | if (need_serialize) |
| 3710 | spin_unlock(&balancing); | 3910 | spin_unlock(&balancing); |
| 3711 | out: | 3911 | out: |
| 3712 | if (time_after(next_balance, sd->last_balance + interval)) { | 3912 | if (time_after(next_balance, sd->last_balance + interval)) { |
| @@ -4070,6 +4270,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
| 4070 | prev->comm, prev->pid, preempt_count()); | 4270 | prev->comm, prev->pid, preempt_count()); |
| 4071 | 4271 | ||
| 4072 | debug_show_held_locks(prev); | 4272 | debug_show_held_locks(prev); |
| 4273 | print_modules(); | ||
| 4073 | if (irqs_disabled()) | 4274 | if (irqs_disabled()) |
| 4074 | print_irqtrace_events(prev); | 4275 | print_irqtrace_events(prev); |
| 4075 | 4276 | ||
| @@ -4143,7 +4344,7 @@ asmlinkage void __sched schedule(void) | |||
| 4143 | struct task_struct *prev, *next; | 4344 | struct task_struct *prev, *next; |
| 4144 | unsigned long *switch_count; | 4345 | unsigned long *switch_count; |
| 4145 | struct rq *rq; | 4346 | struct rq *rq; |
| 4146 | int cpu; | 4347 | int cpu, hrtick = sched_feat(HRTICK); |
| 4147 | 4348 | ||
| 4148 | need_resched: | 4349 | need_resched: |
| 4149 | preempt_disable(); | 4350 | preempt_disable(); |
| @@ -4158,7 +4359,8 @@ need_resched_nonpreemptible: | |||
| 4158 | 4359 | ||
| 4159 | schedule_debug(prev); | 4360 | schedule_debug(prev); |
| 4160 | 4361 | ||
| 4161 | hrtick_clear(rq); | 4362 | if (hrtick) |
| 4363 | hrtick_clear(rq); | ||
| 4162 | 4364 | ||
| 4163 | /* | 4365 | /* |
| 4164 | * Do the rq-clock update outside the rq lock: | 4366 | * Do the rq-clock update outside the rq lock: |
| @@ -4204,7 +4406,8 @@ need_resched_nonpreemptible: | |||
| 4204 | } else | 4406 | } else |
| 4205 | spin_unlock_irq(&rq->lock); | 4407 | spin_unlock_irq(&rq->lock); |
| 4206 | 4408 | ||
| 4207 | hrtick_set(rq); | 4409 | if (hrtick) |
| 4410 | hrtick_set(rq); | ||
| 4208 | 4411 | ||
| 4209 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 4412 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
| 4210 | goto need_resched_nonpreemptible; | 4413 | goto need_resched_nonpreemptible; |
| @@ -4586,10 +4789,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4586 | goto out_unlock; | 4789 | goto out_unlock; |
| 4587 | } | 4790 | } |
| 4588 | on_rq = p->se.on_rq; | 4791 | on_rq = p->se.on_rq; |
| 4589 | if (on_rq) { | 4792 | if (on_rq) |
| 4590 | dequeue_task(rq, p, 0); | 4793 | dequeue_task(rq, p, 0); |
| 4591 | dec_load(rq, p); | ||
| 4592 | } | ||
| 4593 | 4794 | ||
| 4594 | p->static_prio = NICE_TO_PRIO(nice); | 4795 | p->static_prio = NICE_TO_PRIO(nice); |
| 4595 | set_load_weight(p); | 4796 | set_load_weight(p); |
| @@ -4599,7 +4800,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4599 | 4800 | ||
| 4600 | if (on_rq) { | 4801 | if (on_rq) { |
| 4601 | enqueue_task(rq, p, 0); | 4802 | enqueue_task(rq, p, 0); |
| 4602 | inc_load(rq, p); | ||
| 4603 | /* | 4803 | /* |
| 4604 | * If the task increased its priority or is running and | 4804 | * If the task increased its priority or is running and |
| 4605 | * lowered its priority, then reschedule its CPU: | 4805 | * lowered its priority, then reschedule its CPU: |
| @@ -5070,24 +5270,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
| 5070 | return sched_setaffinity(pid, &new_mask); | 5270 | return sched_setaffinity(pid, &new_mask); |
| 5071 | } | 5271 | } |
| 5072 | 5272 | ||
| 5073 | /* | ||
| 5074 | * Represents all cpu's present in the system | ||
| 5075 | * In systems capable of hotplug, this map could dynamically grow | ||
| 5076 | * as new cpu's are detected in the system via any platform specific | ||
| 5077 | * method, such as ACPI for e.g. | ||
| 5078 | */ | ||
| 5079 | |||
| 5080 | cpumask_t cpu_present_map __read_mostly; | ||
| 5081 | EXPORT_SYMBOL(cpu_present_map); | ||
| 5082 | |||
| 5083 | #ifndef CONFIG_SMP | ||
| 5084 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | ||
| 5085 | EXPORT_SYMBOL(cpu_online_map); | ||
| 5086 | |||
| 5087 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | ||
| 5088 | EXPORT_SYMBOL(cpu_possible_map); | ||
| 5089 | #endif | ||
| 5090 | |||
| 5091 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 5273 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
| 5092 | { | 5274 | { |
| 5093 | struct task_struct *p; | 5275 | struct task_struct *p; |
| @@ -5571,6 +5753,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) | |||
| 5571 | goto out; | 5753 | goto out; |
| 5572 | } | 5754 | } |
| 5573 | 5755 | ||
| 5756 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | ||
| 5757 | !cpus_equal(p->cpus_allowed, *new_mask))) { | ||
| 5758 | ret = -EINVAL; | ||
| 5759 | goto out; | ||
| 5760 | } | ||
| 5761 | |||
| 5574 | if (p->sched_class->set_cpus_allowed) | 5762 | if (p->sched_class->set_cpus_allowed) |
| 5575 | p->sched_class->set_cpus_allowed(p, new_mask); | 5763 | p->sched_class->set_cpus_allowed(p, new_mask); |
| 5576 | else { | 5764 | else { |
| @@ -6060,6 +6248,36 @@ static void unregister_sched_domain_sysctl(void) | |||
| 6060 | } | 6248 | } |
| 6061 | #endif | 6249 | #endif |
| 6062 | 6250 | ||
| 6251 | static void set_rq_online(struct rq *rq) | ||
| 6252 | { | ||
| 6253 | if (!rq->online) { | ||
| 6254 | const struct sched_class *class; | ||
| 6255 | |||
| 6256 | cpu_set(rq->cpu, rq->rd->online); | ||
| 6257 | rq->online = 1; | ||
| 6258 | |||
| 6259 | for_each_class(class) { | ||
| 6260 | if (class->rq_online) | ||
| 6261 | class->rq_online(rq); | ||
| 6262 | } | ||
| 6263 | } | ||
| 6264 | } | ||
| 6265 | |||
| 6266 | static void set_rq_offline(struct rq *rq) | ||
| 6267 | { | ||
| 6268 | if (rq->online) { | ||
| 6269 | const struct sched_class *class; | ||
| 6270 | |||
| 6271 | for_each_class(class) { | ||
| 6272 | if (class->rq_offline) | ||
| 6273 | class->rq_offline(rq); | ||
| 6274 | } | ||
| 6275 | |||
| 6276 | cpu_clear(rq->cpu, rq->rd->online); | ||
| 6277 | rq->online = 0; | ||
| 6278 | } | ||
| 6279 | } | ||
| 6280 | |||
| 6063 | /* | 6281 | /* |
| 6064 | * migration_call - callback that gets triggered when a CPU is added. | 6282 | * migration_call - callback that gets triggered when a CPU is added. |
| 6065 | * Here we can start up the necessary migration thread for the new CPU. | 6283 | * Here we can start up the necessary migration thread for the new CPU. |
| @@ -6097,7 +6315,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 6097 | spin_lock_irqsave(&rq->lock, flags); | 6315 | spin_lock_irqsave(&rq->lock, flags); |
| 6098 | if (rq->rd) { | 6316 | if (rq->rd) { |
| 6099 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 6317 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
| 6100 | cpu_set(cpu, rq->rd->online); | 6318 | |
| 6319 | set_rq_online(rq); | ||
| 6101 | } | 6320 | } |
| 6102 | spin_unlock_irqrestore(&rq->lock, flags); | 6321 | spin_unlock_irqrestore(&rq->lock, flags); |
| 6103 | break; | 6322 | break; |
| @@ -6158,7 +6377,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 6158 | spin_lock_irqsave(&rq->lock, flags); | 6377 | spin_lock_irqsave(&rq->lock, flags); |
| 6159 | if (rq->rd) { | 6378 | if (rq->rd) { |
| 6160 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 6379 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
| 6161 | cpu_clear(cpu, rq->rd->online); | 6380 | set_rq_offline(rq); |
| 6162 | } | 6381 | } |
| 6163 | spin_unlock_irqrestore(&rq->lock, flags); | 6382 | spin_unlock_irqrestore(&rq->lock, flags); |
| 6164 | break; | 6383 | break; |
| @@ -6192,6 +6411,28 @@ void __init migration_init(void) | |||
| 6192 | 6411 | ||
| 6193 | #ifdef CONFIG_SCHED_DEBUG | 6412 | #ifdef CONFIG_SCHED_DEBUG |
| 6194 | 6413 | ||
| 6414 | static inline const char *sd_level_to_string(enum sched_domain_level lvl) | ||
| 6415 | { | ||
| 6416 | switch (lvl) { | ||
| 6417 | case SD_LV_NONE: | ||
| 6418 | return "NONE"; | ||
| 6419 | case SD_LV_SIBLING: | ||
| 6420 | return "SIBLING"; | ||
| 6421 | case SD_LV_MC: | ||
| 6422 | return "MC"; | ||
| 6423 | case SD_LV_CPU: | ||
| 6424 | return "CPU"; | ||
| 6425 | case SD_LV_NODE: | ||
| 6426 | return "NODE"; | ||
| 6427 | case SD_LV_ALLNODES: | ||
| 6428 | return "ALLNODES"; | ||
| 6429 | case SD_LV_MAX: | ||
| 6430 | return "MAX"; | ||
| 6431 | |||
| 6432 | } | ||
| 6433 | return "MAX"; | ||
| 6434 | } | ||
| 6435 | |||
| 6195 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 6436 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
| 6196 | cpumask_t *groupmask) | 6437 | cpumask_t *groupmask) |
| 6197 | { | 6438 | { |
| @@ -6211,7 +6452,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 6211 | return -1; | 6452 | return -1; |
| 6212 | } | 6453 | } |
| 6213 | 6454 | ||
| 6214 | printk(KERN_CONT "span %s\n", str); | 6455 | printk(KERN_CONT "span %s level %s\n", |
| 6456 | str, sd_level_to_string(sd->level)); | ||
| 6215 | 6457 | ||
| 6216 | if (!cpu_isset(cpu, sd->span)) { | 6458 | if (!cpu_isset(cpu, sd->span)) { |
| 6217 | printk(KERN_ERR "ERROR: domain->span does not contain " | 6459 | printk(KERN_ERR "ERROR: domain->span does not contain " |
| @@ -6295,9 +6537,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 6295 | } | 6537 | } |
| 6296 | kfree(groupmask); | 6538 | kfree(groupmask); |
| 6297 | } | 6539 | } |
| 6298 | #else | 6540 | #else /* !CONFIG_SCHED_DEBUG */ |
| 6299 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6541 | # define sched_domain_debug(sd, cpu) do { } while (0) |
| 6300 | #endif | 6542 | #endif /* CONFIG_SCHED_DEBUG */ |
| 6301 | 6543 | ||
| 6302 | static int sd_degenerate(struct sched_domain *sd) | 6544 | static int sd_degenerate(struct sched_domain *sd) |
| 6303 | { | 6545 | { |
| @@ -6357,20 +6599,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 6357 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | 6599 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) |
| 6358 | { | 6600 | { |
| 6359 | unsigned long flags; | 6601 | unsigned long flags; |
| 6360 | const struct sched_class *class; | ||
| 6361 | 6602 | ||
| 6362 | spin_lock_irqsave(&rq->lock, flags); | 6603 | spin_lock_irqsave(&rq->lock, flags); |
| 6363 | 6604 | ||
| 6364 | if (rq->rd) { | 6605 | if (rq->rd) { |
| 6365 | struct root_domain *old_rd = rq->rd; | 6606 | struct root_domain *old_rd = rq->rd; |
| 6366 | 6607 | ||
| 6367 | for (class = sched_class_highest; class; class = class->next) { | 6608 | if (cpu_isset(rq->cpu, old_rd->online)) |
| 6368 | if (class->leave_domain) | 6609 | set_rq_offline(rq); |
| 6369 | class->leave_domain(rq); | ||
| 6370 | } | ||
| 6371 | 6610 | ||
| 6372 | cpu_clear(rq->cpu, old_rd->span); | 6611 | cpu_clear(rq->cpu, old_rd->span); |
| 6373 | cpu_clear(rq->cpu, old_rd->online); | ||
| 6374 | 6612 | ||
| 6375 | if (atomic_dec_and_test(&old_rd->refcount)) | 6613 | if (atomic_dec_and_test(&old_rd->refcount)) |
| 6376 | kfree(old_rd); | 6614 | kfree(old_rd); |
| @@ -6381,12 +6619,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
| 6381 | 6619 | ||
| 6382 | cpu_set(rq->cpu, rd->span); | 6620 | cpu_set(rq->cpu, rd->span); |
| 6383 | if (cpu_isset(rq->cpu, cpu_online_map)) | 6621 | if (cpu_isset(rq->cpu, cpu_online_map)) |
| 6384 | cpu_set(rq->cpu, rd->online); | 6622 | set_rq_online(rq); |
| 6385 | |||
| 6386 | for (class = sched_class_highest; class; class = class->next) { | ||
| 6387 | if (class->join_domain) | ||
| 6388 | class->join_domain(rq); | ||
| 6389 | } | ||
| 6390 | 6623 | ||
| 6391 | spin_unlock_irqrestore(&rq->lock, flags); | 6624 | spin_unlock_irqrestore(&rq->lock, flags); |
| 6392 | } | 6625 | } |
| @@ -6397,6 +6630,8 @@ static void init_rootdomain(struct root_domain *rd) | |||
| 6397 | 6630 | ||
| 6398 | cpus_clear(rd->span); | 6631 | cpus_clear(rd->span); |
| 6399 | cpus_clear(rd->online); | 6632 | cpus_clear(rd->online); |
| 6633 | |||
| 6634 | cpupri_init(&rd->cpupri); | ||
| 6400 | } | 6635 | } |
| 6401 | 6636 | ||
| 6402 | static void init_defrootdomain(void) | 6637 | static void init_defrootdomain(void) |
| @@ -6591,7 +6826,7 @@ static void sched_domain_node_span(int node, cpumask_t *span) | |||
| 6591 | cpus_or(*span, *span, *nodemask); | 6826 | cpus_or(*span, *span, *nodemask); |
| 6592 | } | 6827 | } |
| 6593 | } | 6828 | } |
| 6594 | #endif | 6829 | #endif /* CONFIG_NUMA */ |
| 6595 | 6830 | ||
| 6596 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6831 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
| 6597 | 6832 | ||
| @@ -6610,7 +6845,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, | |||
| 6610 | *sg = &per_cpu(sched_group_cpus, cpu); | 6845 | *sg = &per_cpu(sched_group_cpus, cpu); |
| 6611 | return cpu; | 6846 | return cpu; |
| 6612 | } | 6847 | } |
| 6613 | #endif | 6848 | #endif /* CONFIG_SCHED_SMT */ |
| 6614 | 6849 | ||
| 6615 | /* | 6850 | /* |
| 6616 | * multi-core sched-domains: | 6851 | * multi-core sched-domains: |
| @@ -6618,7 +6853,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, | |||
| 6618 | #ifdef CONFIG_SCHED_MC | 6853 | #ifdef CONFIG_SCHED_MC |
| 6619 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6854 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
| 6620 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); | 6855 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
| 6621 | #endif | 6856 | #endif /* CONFIG_SCHED_MC */ |
| 6622 | 6857 | ||
| 6623 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6858 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
| 6624 | static int | 6859 | static int |
| @@ -6720,7 +6955,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
| 6720 | sg = sg->next; | 6955 | sg = sg->next; |
| 6721 | } while (sg != group_head); | 6956 | } while (sg != group_head); |
| 6722 | } | 6957 | } |
| 6723 | #endif | 6958 | #endif /* CONFIG_NUMA */ |
| 6724 | 6959 | ||
| 6725 | #ifdef CONFIG_NUMA | 6960 | #ifdef CONFIG_NUMA |
| 6726 | /* Free memory allocated for various sched_group structures */ | 6961 | /* Free memory allocated for various sched_group structures */ |
| @@ -6757,11 +6992,11 @@ next_sg: | |||
| 6757 | sched_group_nodes_bycpu[cpu] = NULL; | 6992 | sched_group_nodes_bycpu[cpu] = NULL; |
| 6758 | } | 6993 | } |
| 6759 | } | 6994 | } |
| 6760 | #else | 6995 | #else /* !CONFIG_NUMA */ |
| 6761 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) | 6996 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
| 6762 | { | 6997 | { |
| 6763 | } | 6998 | } |
| 6764 | #endif | 6999 | #endif /* CONFIG_NUMA */ |
| 6765 | 7000 | ||
| 6766 | /* | 7001 | /* |
| 6767 | * Initialize sched groups cpu_power. | 7002 | * Initialize sched groups cpu_power. |
| @@ -7470,7 +7705,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
| 7470 | #endif | 7705 | #endif |
| 7471 | return err; | 7706 | return err; |
| 7472 | } | 7707 | } |
| 7473 | #endif | 7708 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
| 7474 | 7709 | ||
| 7475 | /* | 7710 | /* |
| 7476 | * Force a reinitialization of the sched domains hierarchy. The domains | 7711 | * Force a reinitialization of the sched domains hierarchy. The domains |
| @@ -7481,21 +7716,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
| 7481 | static int update_sched_domains(struct notifier_block *nfb, | 7716 | static int update_sched_domains(struct notifier_block *nfb, |
| 7482 | unsigned long action, void *hcpu) | 7717 | unsigned long action, void *hcpu) |
| 7483 | { | 7718 | { |
| 7719 | int cpu = (int)(long)hcpu; | ||
| 7720 | |||
| 7484 | switch (action) { | 7721 | switch (action) { |
| 7485 | case CPU_UP_PREPARE: | ||
| 7486 | case CPU_UP_PREPARE_FROZEN: | ||
| 7487 | case CPU_DOWN_PREPARE: | 7722 | case CPU_DOWN_PREPARE: |
| 7488 | case CPU_DOWN_PREPARE_FROZEN: | 7723 | case CPU_DOWN_PREPARE_FROZEN: |
| 7724 | disable_runtime(cpu_rq(cpu)); | ||
| 7725 | /* fall-through */ | ||
| 7726 | case CPU_UP_PREPARE: | ||
| 7727 | case CPU_UP_PREPARE_FROZEN: | ||
| 7489 | detach_destroy_domains(&cpu_online_map); | 7728 | detach_destroy_domains(&cpu_online_map); |
| 7490 | free_sched_domains(); | 7729 | free_sched_domains(); |
| 7491 | return NOTIFY_OK; | 7730 | return NOTIFY_OK; |
| 7492 | 7731 | ||
| 7493 | case CPU_UP_CANCELED: | 7732 | |
| 7494 | case CPU_UP_CANCELED_FROZEN: | ||
| 7495 | case CPU_DOWN_FAILED: | 7733 | case CPU_DOWN_FAILED: |
| 7496 | case CPU_DOWN_FAILED_FROZEN: | 7734 | case CPU_DOWN_FAILED_FROZEN: |
| 7497 | case CPU_ONLINE: | 7735 | case CPU_ONLINE: |
| 7498 | case CPU_ONLINE_FROZEN: | 7736 | case CPU_ONLINE_FROZEN: |
| 7737 | enable_runtime(cpu_rq(cpu)); | ||
| 7738 | /* fall-through */ | ||
| 7739 | case CPU_UP_CANCELED: | ||
| 7740 | case CPU_UP_CANCELED_FROZEN: | ||
| 7499 | case CPU_DEAD: | 7741 | case CPU_DEAD: |
| 7500 | case CPU_DEAD_FROZEN: | 7742 | case CPU_DEAD_FROZEN: |
| 7501 | /* | 7743 | /* |
| @@ -7695,8 +7937,8 @@ void __init sched_init(void) | |||
| 7695 | 7937 | ||
| 7696 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7938 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
| 7697 | ptr += nr_cpu_ids * sizeof(void **); | 7939 | ptr += nr_cpu_ids * sizeof(void **); |
| 7698 | #endif | 7940 | #endif /* CONFIG_USER_SCHED */ |
| 7699 | #endif | 7941 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 7700 | #ifdef CONFIG_RT_GROUP_SCHED | 7942 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7701 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7943 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; |
| 7702 | ptr += nr_cpu_ids * sizeof(void **); | 7944 | ptr += nr_cpu_ids * sizeof(void **); |
| @@ -7710,8 +7952,8 @@ void __init sched_init(void) | |||
| 7710 | 7952 | ||
| 7711 | root_task_group.rt_rq = (struct rt_rq **)ptr; | 7953 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
| 7712 | ptr += nr_cpu_ids * sizeof(void **); | 7954 | ptr += nr_cpu_ids * sizeof(void **); |
| 7713 | #endif | 7955 | #endif /* CONFIG_USER_SCHED */ |
| 7714 | #endif | 7956 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 7715 | } | 7957 | } |
| 7716 | 7958 | ||
| 7717 | #ifdef CONFIG_SMP | 7959 | #ifdef CONFIG_SMP |
| @@ -7727,8 +7969,8 @@ void __init sched_init(void) | |||
| 7727 | #ifdef CONFIG_USER_SCHED | 7969 | #ifdef CONFIG_USER_SCHED |
| 7728 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 7970 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
| 7729 | global_rt_period(), RUNTIME_INF); | 7971 | global_rt_period(), RUNTIME_INF); |
| 7730 | #endif | 7972 | #endif /* CONFIG_USER_SCHED */ |
| 7731 | #endif | 7973 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 7732 | 7974 | ||
| 7733 | #ifdef CONFIG_GROUP_SCHED | 7975 | #ifdef CONFIG_GROUP_SCHED |
| 7734 | list_add(&init_task_group.list, &task_groups); | 7976 | list_add(&init_task_group.list, &task_groups); |
| @@ -7738,8 +7980,8 @@ void __init sched_init(void) | |||
| 7738 | INIT_LIST_HEAD(&root_task_group.children); | 7980 | INIT_LIST_HEAD(&root_task_group.children); |
| 7739 | init_task_group.parent = &root_task_group; | 7981 | init_task_group.parent = &root_task_group; |
| 7740 | list_add(&init_task_group.siblings, &root_task_group.children); | 7982 | list_add(&init_task_group.siblings, &root_task_group.children); |
| 7741 | #endif | 7983 | #endif /* CONFIG_USER_SCHED */ |
| 7742 | #endif | 7984 | #endif /* CONFIG_GROUP_SCHED */ |
| 7743 | 7985 | ||
| 7744 | for_each_possible_cpu(i) { | 7986 | for_each_possible_cpu(i) { |
| 7745 | struct rq *rq; | 7987 | struct rq *rq; |
| @@ -7819,6 +8061,7 @@ void __init sched_init(void) | |||
| 7819 | rq->next_balance = jiffies; | 8061 | rq->next_balance = jiffies; |
| 7820 | rq->push_cpu = 0; | 8062 | rq->push_cpu = 0; |
| 7821 | rq->cpu = i; | 8063 | rq->cpu = i; |
| 8064 | rq->online = 0; | ||
| 7822 | rq->migration_thread = NULL; | 8065 | rq->migration_thread = NULL; |
| 7823 | INIT_LIST_HEAD(&rq->migration_queue); | 8066 | INIT_LIST_HEAD(&rq->migration_queue); |
| 7824 | rq_attach_root(rq, &def_root_domain); | 8067 | rq_attach_root(rq, &def_root_domain); |
| @@ -8058,7 +8301,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | |||
| 8058 | { | 8301 | { |
| 8059 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8302 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); |
| 8060 | } | 8303 | } |
| 8061 | #else | 8304 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
| 8062 | static inline void free_fair_sched_group(struct task_group *tg) | 8305 | static inline void free_fair_sched_group(struct task_group *tg) |
| 8063 | { | 8306 | { |
| 8064 | } | 8307 | } |
| @@ -8076,7 +8319,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu) | |||
| 8076 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8319 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
| 8077 | { | 8320 | { |
| 8078 | } | 8321 | } |
| 8079 | #endif | 8322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 8080 | 8323 | ||
| 8081 | #ifdef CONFIG_RT_GROUP_SCHED | 8324 | #ifdef CONFIG_RT_GROUP_SCHED |
| 8082 | static void free_rt_sched_group(struct task_group *tg) | 8325 | static void free_rt_sched_group(struct task_group *tg) |
| @@ -8147,7 +8390,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
| 8147 | { | 8390 | { |
| 8148 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | 8391 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); |
| 8149 | } | 8392 | } |
| 8150 | #else | 8393 | #else /* !CONFIG_RT_GROUP_SCHED */ |
| 8151 | static inline void free_rt_sched_group(struct task_group *tg) | 8394 | static inline void free_rt_sched_group(struct task_group *tg) |
| 8152 | { | 8395 | { |
| 8153 | } | 8396 | } |
| @@ -8165,7 +8408,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu) | |||
| 8165 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | 8408 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) |
| 8166 | { | 8409 | { |
| 8167 | } | 8410 | } |
| 8168 | #endif | 8411 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 8169 | 8412 | ||
| 8170 | #ifdef CONFIG_GROUP_SCHED | 8413 | #ifdef CONFIG_GROUP_SCHED |
| 8171 | static void free_sched_group(struct task_group *tg) | 8414 | static void free_sched_group(struct task_group *tg) |
| @@ -8276,17 +8519,14 @@ void sched_move_task(struct task_struct *tsk) | |||
| 8276 | 8519 | ||
| 8277 | task_rq_unlock(rq, &flags); | 8520 | task_rq_unlock(rq, &flags); |
| 8278 | } | 8521 | } |
| 8279 | #endif | 8522 | #endif /* CONFIG_GROUP_SCHED */ |
| 8280 | 8523 | ||
| 8281 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8524 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8282 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 8525 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
| 8283 | { | 8526 | { |
| 8284 | struct cfs_rq *cfs_rq = se->cfs_rq; | 8527 | struct cfs_rq *cfs_rq = se->cfs_rq; |
| 8285 | struct rq *rq = cfs_rq->rq; | ||
| 8286 | int on_rq; | 8528 | int on_rq; |
| 8287 | 8529 | ||
| 8288 | spin_lock_irq(&rq->lock); | ||
| 8289 | |||
| 8290 | on_rq = se->on_rq; | 8530 | on_rq = se->on_rq; |
| 8291 | if (on_rq) | 8531 | if (on_rq) |
| 8292 | dequeue_entity(cfs_rq, se, 0); | 8532 | dequeue_entity(cfs_rq, se, 0); |
| @@ -8296,8 +8536,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) | |||
| 8296 | 8536 | ||
| 8297 | if (on_rq) | 8537 | if (on_rq) |
| 8298 | enqueue_entity(cfs_rq, se, 0); | 8538 | enqueue_entity(cfs_rq, se, 0); |
| 8539 | } | ||
| 8299 | 8540 | ||
| 8300 | spin_unlock_irq(&rq->lock); | 8541 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
| 8542 | { | ||
| 8543 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
| 8544 | struct rq *rq = cfs_rq->rq; | ||
| 8545 | unsigned long flags; | ||
| 8546 | |||
| 8547 | spin_lock_irqsave(&rq->lock, flags); | ||
| 8548 | __set_se_shares(se, shares); | ||
| 8549 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 8301 | } | 8550 | } |
| 8302 | 8551 | ||
| 8303 | static DEFINE_MUTEX(shares_mutex); | 8552 | static DEFINE_MUTEX(shares_mutex); |
| @@ -8336,8 +8585,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 8336 | * w/o tripping rebalance_share or load_balance_fair. | 8585 | * w/o tripping rebalance_share or load_balance_fair. |
| 8337 | */ | 8586 | */ |
| 8338 | tg->shares = shares; | 8587 | tg->shares = shares; |
| 8339 | for_each_possible_cpu(i) | 8588 | for_each_possible_cpu(i) { |
| 8589 | /* | ||
| 8590 | * force a rebalance | ||
| 8591 | */ | ||
| 8592 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | ||
| 8340 | set_se_shares(tg->se[i], shares); | 8593 | set_se_shares(tg->se[i], shares); |
| 8594 | } | ||
| 8341 | 8595 | ||
| 8342 | /* | 8596 | /* |
| 8343 | * Enable load balance activity on this group, by inserting it back on | 8597 | * Enable load balance activity on this group, by inserting it back on |
| @@ -8376,7 +8630,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
| 8376 | #ifdef CONFIG_CGROUP_SCHED | 8630 | #ifdef CONFIG_CGROUP_SCHED |
| 8377 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8631 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
| 8378 | { | 8632 | { |
| 8379 | struct task_group *tgi, *parent = tg ? tg->parent : NULL; | 8633 | struct task_group *tgi, *parent = tg->parent; |
| 8380 | unsigned long total = 0; | 8634 | unsigned long total = 0; |
| 8381 | 8635 | ||
| 8382 | if (!parent) { | 8636 | if (!parent) { |
| @@ -8400,7 +8654,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | |||
| 8400 | } | 8654 | } |
| 8401 | rcu_read_unlock(); | 8655 | rcu_read_unlock(); |
| 8402 | 8656 | ||
| 8403 | return total + to_ratio(period, runtime) < | 8657 | return total + to_ratio(period, runtime) <= |
| 8404 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | 8658 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), |
| 8405 | parent->rt_bandwidth.rt_runtime); | 8659 | parent->rt_bandwidth.rt_runtime); |
| 8406 | } | 8660 | } |
| @@ -8520,16 +8774,21 @@ long sched_group_rt_period(struct task_group *tg) | |||
| 8520 | 8774 | ||
| 8521 | static int sched_rt_global_constraints(void) | 8775 | static int sched_rt_global_constraints(void) |
| 8522 | { | 8776 | { |
| 8777 | struct task_group *tg = &root_task_group; | ||
| 8778 | u64 rt_runtime, rt_period; | ||
| 8523 | int ret = 0; | 8779 | int ret = 0; |
| 8524 | 8780 | ||
| 8781 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 8782 | rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 8783 | |||
| 8525 | mutex_lock(&rt_constraints_mutex); | 8784 | mutex_lock(&rt_constraints_mutex); |
| 8526 | if (!__rt_schedulable(NULL, 1, 0)) | 8785 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) |
| 8527 | ret = -EINVAL; | 8786 | ret = -EINVAL; |
| 8528 | mutex_unlock(&rt_constraints_mutex); | 8787 | mutex_unlock(&rt_constraints_mutex); |
| 8529 | 8788 | ||
| 8530 | return ret; | 8789 | return ret; |
| 8531 | } | 8790 | } |
| 8532 | #else | 8791 | #else /* !CONFIG_RT_GROUP_SCHED */ |
| 8533 | static int sched_rt_global_constraints(void) | 8792 | static int sched_rt_global_constraints(void) |
| 8534 | { | 8793 | { |
| 8535 | unsigned long flags; | 8794 | unsigned long flags; |
| @@ -8547,7 +8806,7 @@ static int sched_rt_global_constraints(void) | |||
| 8547 | 8806 | ||
| 8548 | return 0; | 8807 | return 0; |
| 8549 | } | 8808 | } |
| 8550 | #endif | 8809 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 8551 | 8810 | ||
| 8552 | int sched_rt_handler(struct ctl_table *table, int write, | 8811 | int sched_rt_handler(struct ctl_table *table, int write, |
| 8553 | struct file *filp, void __user *buffer, size_t *lenp, | 8812 | struct file *filp, void __user *buffer, size_t *lenp, |
| @@ -8655,7 +8914,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
| 8655 | 8914 | ||
| 8656 | return (u64) tg->shares; | 8915 | return (u64) tg->shares; |
| 8657 | } | 8916 | } |
| 8658 | #endif | 8917 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 8659 | 8918 | ||
| 8660 | #ifdef CONFIG_RT_GROUP_SCHED | 8919 | #ifdef CONFIG_RT_GROUP_SCHED |
| 8661 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 8920 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
| @@ -8679,7 +8938,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
| 8679 | { | 8938 | { |
| 8680 | return sched_group_rt_period(cgroup_tg(cgrp)); | 8939 | return sched_group_rt_period(cgroup_tg(cgrp)); |
| 8681 | } | 8940 | } |
| 8682 | #endif | 8941 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 8683 | 8942 | ||
| 8684 | static struct cftype cpu_files[] = { | 8943 | static struct cftype cpu_files[] = { |
| 8685 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8944 | #ifdef CONFIG_FAIR_GROUP_SCHED |
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ce05271219ab..22ed55d1167f 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
| @@ -3,6 +3,9 @@ | |||
| 3 | * | 3 | * |
| 4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
| 5 | * | 5 | * |
| 6 | * Updates and enhancements: | ||
| 7 | * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com> | ||
| 8 | * | ||
| 6 | * Based on code by: | 9 | * Based on code by: |
| 7 | * Ingo Molnar <mingo@redhat.com> | 10 | * Ingo Molnar <mingo@redhat.com> |
| 8 | * Guillaume Chazarain <guichaz@gmail.com> | 11 | * Guillaume Chazarain <guichaz@gmail.com> |
| @@ -32,6 +35,11 @@ | |||
| 32 | 35 | ||
| 33 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 36 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
| 34 | 37 | ||
| 38 | #define MULTI_SHIFT 15 | ||
| 39 | /* Max is double, Min is 1/2 */ | ||
| 40 | #define MAX_MULTI (2LL << MULTI_SHIFT) | ||
| 41 | #define MIN_MULTI (1LL << (MULTI_SHIFT-1)) | ||
| 42 | |||
| 35 | struct sched_clock_data { | 43 | struct sched_clock_data { |
| 36 | /* | 44 | /* |
| 37 | * Raw spinlock - this is a special case: this might be called | 45 | * Raw spinlock - this is a special case: this might be called |
| @@ -40,11 +48,15 @@ struct sched_clock_data { | |||
| 40 | */ | 48 | */ |
| 41 | raw_spinlock_t lock; | 49 | raw_spinlock_t lock; |
| 42 | 50 | ||
| 43 | unsigned long prev_jiffies; | 51 | unsigned long tick_jiffies; |
| 44 | u64 prev_raw; | 52 | u64 prev_raw; |
| 45 | u64 tick_raw; | 53 | u64 tick_raw; |
| 46 | u64 tick_gtod; | 54 | u64 tick_gtod; |
| 47 | u64 clock; | 55 | u64 clock; |
| 56 | s64 multi; | ||
| 57 | #ifdef CONFIG_NO_HZ | ||
| 58 | int check_max; | ||
| 59 | #endif | ||
| 48 | }; | 60 | }; |
| 49 | 61 | ||
| 50 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); | 62 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); |
| @@ -71,41 +83,91 @@ void sched_clock_init(void) | |||
| 71 | struct sched_clock_data *scd = cpu_sdc(cpu); | 83 | struct sched_clock_data *scd = cpu_sdc(cpu); |
| 72 | 84 | ||
| 73 | scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; | 85 | scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; |
| 74 | scd->prev_jiffies = now_jiffies; | 86 | scd->tick_jiffies = now_jiffies; |
| 75 | scd->prev_raw = 0; | 87 | scd->prev_raw = 0; |
| 76 | scd->tick_raw = 0; | 88 | scd->tick_raw = 0; |
| 77 | scd->tick_gtod = ktime_now; | 89 | scd->tick_gtod = ktime_now; |
| 78 | scd->clock = ktime_now; | 90 | scd->clock = ktime_now; |
| 91 | scd->multi = 1 << MULTI_SHIFT; | ||
| 92 | #ifdef CONFIG_NO_HZ | ||
| 93 | scd->check_max = 1; | ||
| 94 | #endif | ||
| 79 | } | 95 | } |
| 80 | 96 | ||
| 81 | sched_clock_running = 1; | 97 | sched_clock_running = 1; |
| 82 | } | 98 | } |
| 83 | 99 | ||
| 100 | #ifdef CONFIG_NO_HZ | ||
| 101 | /* | ||
| 102 | * Dynamic ticks make the delta jiffies inaccurate. This | ||
| 103 | * prevents us from checking the maximum time update. | ||
| 104 | * Disable the maximum check during stopped ticks. | ||
| 105 | */ | ||
| 106 | void sched_clock_tick_stop(int cpu) | ||
| 107 | { | ||
| 108 | struct sched_clock_data *scd = cpu_sdc(cpu); | ||
| 109 | |||
| 110 | scd->check_max = 0; | ||
| 111 | } | ||
| 112 | |||
| 113 | void sched_clock_tick_start(int cpu) | ||
| 114 | { | ||
| 115 | struct sched_clock_data *scd = cpu_sdc(cpu); | ||
| 116 | |||
| 117 | scd->check_max = 1; | ||
| 118 | } | ||
| 119 | |||
| 120 | static int check_max(struct sched_clock_data *scd) | ||
| 121 | { | ||
| 122 | return scd->check_max; | ||
| 123 | } | ||
| 124 | #else | ||
| 125 | static int check_max(struct sched_clock_data *scd) | ||
| 126 | { | ||
| 127 | return 1; | ||
| 128 | } | ||
| 129 | #endif /* CONFIG_NO_HZ */ | ||
| 130 | |||
| 84 | /* | 131 | /* |
| 85 | * update the percpu scd from the raw @now value | 132 | * update the percpu scd from the raw @now value |
| 86 | * | 133 | * |
| 87 | * - filter out backward motion | 134 | * - filter out backward motion |
| 88 | * - use jiffies to generate a min,max window to clip the raw values | 135 | * - use jiffies to generate a min,max window to clip the raw values |
| 89 | */ | 136 | */ |
| 90 | static void __update_sched_clock(struct sched_clock_data *scd, u64 now) | 137 | static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) |
| 91 | { | 138 | { |
| 92 | unsigned long now_jiffies = jiffies; | 139 | unsigned long now_jiffies = jiffies; |
| 93 | long delta_jiffies = now_jiffies - scd->prev_jiffies; | 140 | long delta_jiffies = now_jiffies - scd->tick_jiffies; |
| 94 | u64 clock = scd->clock; | 141 | u64 clock = scd->clock; |
| 95 | u64 min_clock, max_clock; | 142 | u64 min_clock, max_clock; |
| 96 | s64 delta = now - scd->prev_raw; | 143 | s64 delta = now - scd->prev_raw; |
| 97 | 144 | ||
| 98 | WARN_ON_ONCE(!irqs_disabled()); | 145 | WARN_ON_ONCE(!irqs_disabled()); |
| 99 | min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; | 146 | |
| 147 | /* | ||
| 148 | * At schedule tick the clock can be just under the gtod. We don't | ||
| 149 | * want to push it too prematurely. | ||
| 150 | */ | ||
| 151 | min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); | ||
| 152 | if (min_clock > TICK_NSEC) | ||
| 153 | min_clock -= TICK_NSEC / 2; | ||
| 100 | 154 | ||
| 101 | if (unlikely(delta < 0)) { | 155 | if (unlikely(delta < 0)) { |
| 102 | clock++; | 156 | clock++; |
| 103 | goto out; | 157 | goto out; |
| 104 | } | 158 | } |
| 105 | 159 | ||
| 106 | max_clock = min_clock + TICK_NSEC; | 160 | /* |
| 161 | * The clock must stay within a jiffy of the gtod. | ||
| 162 | * But since we may be at the start of a jiffy or the end of one | ||
| 163 | * we add another jiffy buffer. | ||
| 164 | */ | ||
| 165 | max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; | ||
| 166 | |||
| 167 | delta *= scd->multi; | ||
| 168 | delta >>= MULTI_SHIFT; | ||
| 107 | 169 | ||
| 108 | if (unlikely(clock + delta > max_clock)) { | 170 | if (unlikely(clock + delta > max_clock) && check_max(scd)) { |
| 109 | if (clock < max_clock) | 171 | if (clock < max_clock) |
| 110 | clock = max_clock; | 172 | clock = max_clock; |
| 111 | else | 173 | else |
| @@ -118,9 +180,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) | |||
| 118 | if (unlikely(clock < min_clock)) | 180 | if (unlikely(clock < min_clock)) |
| 119 | clock = min_clock; | 181 | clock = min_clock; |
| 120 | 182 | ||
| 121 | scd->prev_raw = now; | 183 | if (time) |
| 122 | scd->prev_jiffies = now_jiffies; | 184 | *time = clock; |
| 123 | scd->clock = clock; | 185 | else { |
| 186 | scd->prev_raw = now; | ||
| 187 | scd->clock = clock; | ||
| 188 | } | ||
| 124 | } | 189 | } |
| 125 | 190 | ||
| 126 | static void lock_double_clock(struct sched_clock_data *data1, | 191 | static void lock_double_clock(struct sched_clock_data *data1, |
| @@ -160,25 +225,30 @@ u64 sched_clock_cpu(int cpu) | |||
| 160 | now -= my_scd->tick_raw; | 225 | now -= my_scd->tick_raw; |
| 161 | now += scd->tick_raw; | 226 | now += scd->tick_raw; |
| 162 | 227 | ||
| 163 | now -= my_scd->tick_gtod; | 228 | now += my_scd->tick_gtod; |
| 164 | now += scd->tick_gtod; | 229 | now -= scd->tick_gtod; |
| 165 | 230 | ||
| 166 | __raw_spin_unlock(&my_scd->lock); | 231 | __raw_spin_unlock(&my_scd->lock); |
| 232 | |||
| 233 | __update_sched_clock(scd, now, &clock); | ||
| 234 | |||
| 235 | __raw_spin_unlock(&scd->lock); | ||
| 236 | |||
| 167 | } else { | 237 | } else { |
| 168 | __raw_spin_lock(&scd->lock); | 238 | __raw_spin_lock(&scd->lock); |
| 239 | __update_sched_clock(scd, now, NULL); | ||
| 240 | clock = scd->clock; | ||
| 241 | __raw_spin_unlock(&scd->lock); | ||
| 169 | } | 242 | } |
| 170 | 243 | ||
| 171 | __update_sched_clock(scd, now); | ||
| 172 | clock = scd->clock; | ||
| 173 | |||
| 174 | __raw_spin_unlock(&scd->lock); | ||
| 175 | |||
| 176 | return clock; | 244 | return clock; |
| 177 | } | 245 | } |
| 178 | 246 | ||
| 179 | void sched_clock_tick(void) | 247 | void sched_clock_tick(void) |
| 180 | { | 248 | { |
| 181 | struct sched_clock_data *scd = this_scd(); | 249 | struct sched_clock_data *scd = this_scd(); |
| 250 | unsigned long now_jiffies = jiffies; | ||
| 251 | s64 mult, delta_gtod, delta_raw; | ||
| 182 | u64 now, now_gtod; | 252 | u64 now, now_gtod; |
| 183 | 253 | ||
| 184 | if (unlikely(!sched_clock_running)) | 254 | if (unlikely(!sched_clock_running)) |
| @@ -186,18 +256,33 @@ void sched_clock_tick(void) | |||
| 186 | 256 | ||
| 187 | WARN_ON_ONCE(!irqs_disabled()); | 257 | WARN_ON_ONCE(!irqs_disabled()); |
| 188 | 258 | ||
| 189 | now = sched_clock(); | ||
| 190 | now_gtod = ktime_to_ns(ktime_get()); | 259 | now_gtod = ktime_to_ns(ktime_get()); |
| 260 | now = sched_clock(); | ||
| 191 | 261 | ||
| 192 | __raw_spin_lock(&scd->lock); | 262 | __raw_spin_lock(&scd->lock); |
| 193 | __update_sched_clock(scd, now); | 263 | __update_sched_clock(scd, now, NULL); |
| 194 | /* | 264 | /* |
| 195 | * update tick_gtod after __update_sched_clock() because that will | 265 | * update tick_gtod after __update_sched_clock() because that will |
| 196 | * already observe 1 new jiffy; adding a new tick_gtod to that would | 266 | * already observe 1 new jiffy; adding a new tick_gtod to that would |
| 197 | * increase the clock 2 jiffies. | 267 | * increase the clock 2 jiffies. |
| 198 | */ | 268 | */ |
| 269 | delta_gtod = now_gtod - scd->tick_gtod; | ||
| 270 | delta_raw = now - scd->tick_raw; | ||
| 271 | |||
| 272 | if ((long)delta_raw > 0) { | ||
| 273 | mult = delta_gtod << MULTI_SHIFT; | ||
| 274 | do_div(mult, delta_raw); | ||
| 275 | scd->multi = mult; | ||
| 276 | if (scd->multi > MAX_MULTI) | ||
| 277 | scd->multi = MAX_MULTI; | ||
| 278 | else if (scd->multi < MIN_MULTI) | ||
| 279 | scd->multi = MIN_MULTI; | ||
| 280 | } else | ||
| 281 | scd->multi = 1 << MULTI_SHIFT; | ||
| 282 | |||
| 199 | scd->tick_raw = now; | 283 | scd->tick_raw = now; |
| 200 | scd->tick_gtod = now_gtod; | 284 | scd->tick_gtod = now_gtod; |
| 285 | scd->tick_jiffies = now_jiffies; | ||
| 201 | __raw_spin_unlock(&scd->lock); | 286 | __raw_spin_unlock(&scd->lock); |
| 202 | } | 287 | } |
| 203 | 288 | ||
| @@ -227,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
| 227 | __raw_spin_lock(&scd->lock); | 312 | __raw_spin_lock(&scd->lock); |
| 228 | scd->prev_raw = now; | 313 | scd->prev_raw = now; |
| 229 | scd->clock += delta_ns; | 314 | scd->clock += delta_ns; |
| 315 | scd->multi = 1 << MULTI_SHIFT; | ||
| 230 | __raw_spin_unlock(&scd->lock); | 316 | __raw_spin_unlock(&scd->lock); |
| 231 | 317 | ||
| 232 | touch_softlockup_watchdog(); | 318 | touch_softlockup_watchdog(); |
| @@ -244,3 +330,16 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
| 244 | { | 330 | { |
| 245 | return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); | 331 | return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); |
| 246 | } | 332 | } |
| 333 | |||
| 334 | unsigned long long cpu_clock(int cpu) | ||
| 335 | { | ||
| 336 | unsigned long long clock; | ||
| 337 | unsigned long flags; | ||
| 338 | |||
| 339 | local_irq_save(flags); | ||
| 340 | clock = sched_clock_cpu(cpu); | ||
| 341 | local_irq_restore(flags); | ||
| 342 | |||
| 343 | return clock; | ||
| 344 | } | ||
| 345 | EXPORT_SYMBOL_GPL(cpu_clock); | ||
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c new file mode 100644 index 000000000000..52154fefab7e --- /dev/null +++ b/kernel/sched_cpupri.c | |||
| @@ -0,0 +1,174 @@ | |||
| 1 | /* | ||
| 2 | * kernel/sched_cpupri.c | ||
| 3 | * | ||
| 4 | * CPU priority management | ||
| 5 | * | ||
| 6 | * Copyright (C) 2007-2008 Novell | ||
| 7 | * | ||
| 8 | * Author: Gregory Haskins <ghaskins@novell.com> | ||
| 9 | * | ||
| 10 | * This code tracks the priority of each CPU so that global migration | ||
| 11 | * decisions are easy to calculate. Each CPU can be in a state as follows: | ||
| 12 | * | ||
| 13 | * (INVALID), IDLE, NORMAL, RT1, ... RT99 | ||
| 14 | * | ||
| 15 | * going from the lowest priority to the highest. CPUs in the INVALID state | ||
| 16 | * are not eligible for routing. The system maintains this state with | ||
| 17 | * a 2 dimensional bitmap (the first for priority class, the second for cpus | ||
| 18 | * in that class). Therefore a typical application without affinity | ||
| 19 | * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit | ||
| 20 | * searches). For tasks with affinity restrictions, the algorithm has a | ||
| 21 | * worst case complexity of O(min(102, nr_domcpus)), though the scenario that | ||
| 22 | * yields the worst case search is fairly contrived. | ||
| 23 | * | ||
| 24 | * This program is free software; you can redistribute it and/or | ||
| 25 | * modify it under the terms of the GNU General Public License | ||
| 26 | * as published by the Free Software Foundation; version 2 | ||
| 27 | * of the License. | ||
| 28 | */ | ||
| 29 | |||
| 30 | #include "sched_cpupri.h" | ||
| 31 | |||
| 32 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | ||
| 33 | static int convert_prio(int prio) | ||
| 34 | { | ||
| 35 | int cpupri; | ||
| 36 | |||
| 37 | if (prio == CPUPRI_INVALID) | ||
| 38 | cpupri = CPUPRI_INVALID; | ||
| 39 | else if (prio == MAX_PRIO) | ||
| 40 | cpupri = CPUPRI_IDLE; | ||
| 41 | else if (prio >= MAX_RT_PRIO) | ||
| 42 | cpupri = CPUPRI_NORMAL; | ||
| 43 | else | ||
| 44 | cpupri = MAX_RT_PRIO - prio + 1; | ||
| 45 | |||
| 46 | return cpupri; | ||
| 47 | } | ||
| 48 | |||
| 49 | #define for_each_cpupri_active(array, idx) \ | ||
| 50 | for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ | ||
| 51 | idx < CPUPRI_NR_PRIORITIES; \ | ||
| 52 | idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) | ||
| 53 | |||
| 54 | /** | ||
| 55 | * cpupri_find - find the best (lowest-pri) CPU in the system | ||
| 56 | * @cp: The cpupri context | ||
| 57 | * @p: The task | ||
| 58 | * @lowest_mask: A mask to fill in with selected CPUs | ||
| 59 | * | ||
| 60 | * Note: This function returns the recommended CPUs as calculated during the | ||
| 61 | * current invocation. By the time the call returns, the CPUs may have in | ||
| 62 | * fact changed priorities any number of times. While not ideal, it is not | ||
| 63 | * an issue of correctness since the normal rebalancer logic will correct | ||
| 64 | * any discrepancies created by racing against the uncertainty of the current | ||
| 65 | * priority configuration. | ||
| 66 | * | ||
| 67 | * Returns: (int)bool - CPUs were found | ||
| 68 | */ | ||
| 69 | int cpupri_find(struct cpupri *cp, struct task_struct *p, | ||
| 70 | cpumask_t *lowest_mask) | ||
| 71 | { | ||
| 72 | int idx = 0; | ||
| 73 | int task_pri = convert_prio(p->prio); | ||
| 74 | |||
| 75 | for_each_cpupri_active(cp->pri_active, idx) { | ||
| 76 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; | ||
| 77 | cpumask_t mask; | ||
| 78 | |||
| 79 | if (idx >= task_pri) | ||
| 80 | break; | ||
| 81 | |||
| 82 | cpus_and(mask, p->cpus_allowed, vec->mask); | ||
| 83 | |||
| 84 | if (cpus_empty(mask)) | ||
| 85 | continue; | ||
| 86 | |||
| 87 | *lowest_mask = mask; | ||
| 88 | return 1; | ||
| 89 | } | ||
| 90 | |||
| 91 | return 0; | ||
| 92 | } | ||
| 93 | |||
| 94 | /** | ||
| 95 | * cpupri_set - update the cpu priority setting | ||
| 96 | * @cp: The cpupri context | ||
| 97 | * @cpu: The target cpu | ||
| 98 | * @newpri: The priority (INVALID-RT99) to assign to this CPU | ||
| 99 | * | ||
| 100 | * Note: Assumes cpu_rq(cpu)->lock is locked | ||
| 101 | * | ||
| 102 | * Returns: (void) | ||
| 103 | */ | ||
| 104 | void cpupri_set(struct cpupri *cp, int cpu, int newpri) | ||
| 105 | { | ||
| 106 | int *currpri = &cp->cpu_to_pri[cpu]; | ||
| 107 | int oldpri = *currpri; | ||
| 108 | unsigned long flags; | ||
| 109 | |||
| 110 | newpri = convert_prio(newpri); | ||
| 111 | |||
| 112 | BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); | ||
| 113 | |||
| 114 | if (newpri == oldpri) | ||
| 115 | return; | ||
| 116 | |||
| 117 | /* | ||
| 118 | * If the cpu was currently mapped to a different value, we | ||
| 119 | * first need to unmap the old value | ||
| 120 | */ | ||
| 121 | if (likely(oldpri != CPUPRI_INVALID)) { | ||
| 122 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | ||
| 123 | |||
| 124 | spin_lock_irqsave(&vec->lock, flags); | ||
| 125 | |||
| 126 | vec->count--; | ||
| 127 | if (!vec->count) | ||
| 128 | clear_bit(oldpri, cp->pri_active); | ||
| 129 | cpu_clear(cpu, vec->mask); | ||
| 130 | |||
| 131 | spin_unlock_irqrestore(&vec->lock, flags); | ||
| 132 | } | ||
| 133 | |||
| 134 | if (likely(newpri != CPUPRI_INVALID)) { | ||
| 135 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; | ||
| 136 | |||
| 137 | spin_lock_irqsave(&vec->lock, flags); | ||
| 138 | |||
| 139 | cpu_set(cpu, vec->mask); | ||
| 140 | vec->count++; | ||
| 141 | if (vec->count == 1) | ||
| 142 | set_bit(newpri, cp->pri_active); | ||
| 143 | |||
| 144 | spin_unlock_irqrestore(&vec->lock, flags); | ||
| 145 | } | ||
| 146 | |||
| 147 | *currpri = newpri; | ||
| 148 | } | ||
| 149 | |||
| 150 | /** | ||
| 151 | * cpupri_init - initialize the cpupri structure | ||
| 152 | * @cp: The cpupri context | ||
| 153 | * | ||
| 154 | * Returns: (void) | ||
| 155 | */ | ||
| 156 | void cpupri_init(struct cpupri *cp) | ||
| 157 | { | ||
| 158 | int i; | ||
| 159 | |||
| 160 | memset(cp, 0, sizeof(*cp)); | ||
| 161 | |||
| 162 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { | ||
| 163 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; | ||
| 164 | |||
| 165 | spin_lock_init(&vec->lock); | ||
| 166 | vec->count = 0; | ||
| 167 | cpus_clear(vec->mask); | ||
| 168 | } | ||
| 169 | |||
| 170 | for_each_possible_cpu(i) | ||
| 171 | cp->cpu_to_pri[i] = CPUPRI_INVALID; | ||
| 172 | } | ||
| 173 | |||
| 174 | |||
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h new file mode 100644 index 000000000000..f25811b0f931 --- /dev/null +++ b/kernel/sched_cpupri.h | |||
| @@ -0,0 +1,36 @@ | |||
| 1 | #ifndef _LINUX_CPUPRI_H | ||
| 2 | #define _LINUX_CPUPRI_H | ||
| 3 | |||
| 4 | #include <linux/sched.h> | ||
| 5 | |||
| 6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | ||
| 7 | #define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) | ||
| 8 | |||
| 9 | #define CPUPRI_INVALID -1 | ||
| 10 | #define CPUPRI_IDLE 0 | ||
| 11 | #define CPUPRI_NORMAL 1 | ||
| 12 | /* values 2-101 are RT priorities 0-99 */ | ||
| 13 | |||
| 14 | struct cpupri_vec { | ||
| 15 | spinlock_t lock; | ||
| 16 | int count; | ||
| 17 | cpumask_t mask; | ||
| 18 | }; | ||
| 19 | |||
| 20 | struct cpupri { | ||
| 21 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | ||
| 22 | long pri_active[CPUPRI_NR_PRI_WORDS]; | ||
| 23 | int cpu_to_pri[NR_CPUS]; | ||
| 24 | }; | ||
| 25 | |||
| 26 | #ifdef CONFIG_SMP | ||
| 27 | int cpupri_find(struct cpupri *cp, | ||
| 28 | struct task_struct *p, cpumask_t *lowest_mask); | ||
| 29 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | ||
| 30 | void cpupri_init(struct cpupri *cp); | ||
| 31 | #else | ||
| 32 | #define cpupri_set(cp, cpu, pri) do { } while (0) | ||
| 33 | #define cpupri_init() do { } while (0) | ||
| 34 | #endif | ||
| 35 | |||
| 36 | #endif /* _LINUX_CPUPRI_H */ | ||
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8bb713040ac9..bbe6b31c3c56 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
| @@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 119 | struct sched_entity *last; | 119 | struct sched_entity *last; |
| 120 | unsigned long flags; | 120 | unsigned long flags; |
| 121 | 121 | ||
| 122 | #if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) | 122 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) |
| 123 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | ||
| 124 | #else | ||
| 125 | char path[128] = ""; | 123 | char path[128] = ""; |
| 126 | struct cgroup *cgroup = NULL; | 124 | struct cgroup *cgroup = NULL; |
| 127 | struct task_group *tg = cfs_rq->tg; | 125 | struct task_group *tg = cfs_rq->tg; |
| @@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 133 | cgroup_path(cgroup, path, sizeof(path)); | 131 | cgroup_path(cgroup, path, sizeof(path)); |
| 134 | 132 | ||
| 135 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); | 133 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); |
| 134 | #else | ||
| 135 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | ||
| 136 | #endif | 136 | #endif |
| 137 | 137 | ||
| 138 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", | 138 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
| @@ -162,11 +162,64 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 162 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | 162 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); |
| 163 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 163 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
| 164 | #ifdef CONFIG_SCHEDSTATS | 164 | #ifdef CONFIG_SCHEDSTATS |
| 165 | SEQ_printf(m, " .%-30s: %d\n", "bkl_count", | 165 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); |
| 166 | rq->bkl_count); | 166 | |
| 167 | P(yld_exp_empty); | ||
| 168 | P(yld_act_empty); | ||
| 169 | P(yld_both_empty); | ||
| 170 | P(yld_count); | ||
| 171 | |||
| 172 | P(sched_switch); | ||
| 173 | P(sched_count); | ||
| 174 | P(sched_goidle); | ||
| 175 | |||
| 176 | P(ttwu_count); | ||
| 177 | P(ttwu_local); | ||
| 178 | |||
| 179 | P(bkl_count); | ||
| 180 | |||
| 181 | #undef P | ||
| 167 | #endif | 182 | #endif |
| 168 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", | 183 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", |
| 169 | cfs_rq->nr_spread_over); | 184 | cfs_rq->nr_spread_over); |
| 185 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 186 | #ifdef CONFIG_SMP | ||
| 187 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | ||
| 188 | #endif | ||
| 189 | #endif | ||
| 190 | } | ||
| 191 | |||
| 192 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | ||
| 193 | { | ||
| 194 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) | ||
| 195 | char path[128] = ""; | ||
| 196 | struct cgroup *cgroup = NULL; | ||
| 197 | struct task_group *tg = rt_rq->tg; | ||
| 198 | |||
| 199 | if (tg) | ||
| 200 | cgroup = tg->css.cgroup; | ||
| 201 | |||
| 202 | if (cgroup) | ||
| 203 | cgroup_path(cgroup, path, sizeof(path)); | ||
| 204 | |||
| 205 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); | ||
| 206 | #else | ||
| 207 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); | ||
| 208 | #endif | ||
| 209 | |||
| 210 | |||
| 211 | #define P(x) \ | ||
| 212 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) | ||
| 213 | #define PN(x) \ | ||
| 214 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) | ||
| 215 | |||
| 216 | P(rt_nr_running); | ||
| 217 | P(rt_throttled); | ||
| 218 | PN(rt_time); | ||
| 219 | PN(rt_runtime); | ||
| 220 | |||
| 221 | #undef PN | ||
| 222 | #undef P | ||
| 170 | } | 223 | } |
| 171 | 224 | ||
| 172 | static void print_cpu(struct seq_file *m, int cpu) | 225 | static void print_cpu(struct seq_file *m, int cpu) |
| @@ -208,6 +261,7 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
| 208 | #undef PN | 261 | #undef PN |
| 209 | 262 | ||
| 210 | print_cfs_stats(m, cpu); | 263 | print_cfs_stats(m, cpu); |
| 264 | print_rt_stats(m, cpu); | ||
| 211 | 265 | ||
| 212 | print_rq(m, rq, cpu); | 266 | print_rq(m, rq, cpu); |
| 213 | } | 267 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 08ae848b71d4..f2aa987027d6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * SCHED_OTHER wake-up granularity. | 65 | * SCHED_OTHER wake-up granularity. |
| 66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) | 66 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) |
| 67 | * | 67 | * |
| 68 | * This option delays the preemption effects of decoupled workloads | 68 | * This option delays the preemption effects of decoupled workloads |
| 69 | * and reduces their over-scheduling. Synchronous workloads will still | 69 | * and reduces their over-scheduling. Synchronous workloads will still |
| 70 | * have immediate wakeup/sleep latencies. | 70 | * have immediate wakeup/sleep latencies. |
| 71 | */ | 71 | */ |
| 72 | unsigned int sysctl_sched_wakeup_granularity = 10000000UL; | 72 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; |
| 73 | 73 | ||
| 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
| 75 | 75 | ||
| @@ -334,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, | |||
| 334 | #endif | 334 | #endif |
| 335 | 335 | ||
| 336 | /* | 336 | /* |
| 337 | * delta *= w / rw | ||
| 338 | */ | ||
| 339 | static inline unsigned long | ||
| 340 | calc_delta_weight(unsigned long delta, struct sched_entity *se) | ||
| 341 | { | ||
| 342 | for_each_sched_entity(se) { | ||
| 343 | delta = calc_delta_mine(delta, | ||
| 344 | se->load.weight, &cfs_rq_of(se)->load); | ||
| 345 | } | ||
| 346 | |||
| 347 | return delta; | ||
| 348 | } | ||
| 349 | |||
| 350 | /* | ||
| 351 | * delta *= rw / w | ||
| 352 | */ | ||
| 353 | static inline unsigned long | ||
| 354 | calc_delta_fair(unsigned long delta, struct sched_entity *se) | ||
| 355 | { | ||
| 356 | for_each_sched_entity(se) { | ||
| 357 | delta = calc_delta_mine(delta, | ||
| 358 | cfs_rq_of(se)->load.weight, &se->load); | ||
| 359 | } | ||
| 360 | |||
| 361 | return delta; | ||
| 362 | } | ||
| 363 | |||
| 364 | /* | ||
| 337 | * The idea is to set a period in which each task runs once. | 365 | * The idea is to set a period in which each task runs once. |
| 338 | * | 366 | * |
| 339 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 367 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
| @@ -362,47 +390,80 @@ static u64 __sched_period(unsigned long nr_running) | |||
| 362 | */ | 390 | */ |
| 363 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | 391 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 364 | { | 392 | { |
| 365 | u64 slice = __sched_period(cfs_rq->nr_running); | 393 | return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); |
| 366 | |||
| 367 | for_each_sched_entity(se) { | ||
| 368 | cfs_rq = cfs_rq_of(se); | ||
| 369 | |||
| 370 | slice *= se->load.weight; | ||
| 371 | do_div(slice, cfs_rq->load.weight); | ||
| 372 | } | ||
| 373 | |||
| 374 | |||
| 375 | return slice; | ||
| 376 | } | 394 | } |
| 377 | 395 | ||
| 378 | /* | 396 | /* |
| 379 | * We calculate the vruntime slice of a to be inserted task | 397 | * We calculate the vruntime slice of a to be inserted task |
| 380 | * | 398 | * |
| 381 | * vs = s/w = p/rw | 399 | * vs = s*rw/w = p |
| 382 | */ | 400 | */ |
| 383 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | 401 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 384 | { | 402 | { |
| 385 | unsigned long nr_running = cfs_rq->nr_running; | 403 | unsigned long nr_running = cfs_rq->nr_running; |
| 386 | unsigned long weight; | ||
| 387 | u64 vslice; | ||
| 388 | 404 | ||
| 389 | if (!se->on_rq) | 405 | if (!se->on_rq) |
| 390 | nr_running++; | 406 | nr_running++; |
| 391 | 407 | ||
| 392 | vslice = __sched_period(nr_running); | 408 | return __sched_period(nr_running); |
| 409 | } | ||
| 410 | |||
| 411 | /* | ||
| 412 | * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in | ||
| 413 | * that it favours >=0 over <0. | ||
| 414 | * | ||
| 415 | * -20 | | ||
| 416 | * | | ||
| 417 | * 0 --------+------- | ||
| 418 | * .' | ||
| 419 | * 19 .' | ||
| 420 | * | ||
| 421 | */ | ||
| 422 | static unsigned long | ||
| 423 | calc_delta_asym(unsigned long delta, struct sched_entity *se) | ||
| 424 | { | ||
| 425 | struct load_weight lw = { | ||
| 426 | .weight = NICE_0_LOAD, | ||
| 427 | .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) | ||
| 428 | }; | ||
| 393 | 429 | ||
| 394 | for_each_sched_entity(se) { | 430 | for_each_sched_entity(se) { |
| 395 | cfs_rq = cfs_rq_of(se); | 431 | struct load_weight *se_lw = &se->load; |
| 432 | unsigned long rw = cfs_rq_of(se)->load.weight; | ||
| 433 | |||
| 434 | #ifdef CONFIG_FAIR_SCHED_GROUP | ||
| 435 | struct cfs_rq *cfs_rq = se->my_q; | ||
| 436 | struct task_group *tg = NULL | ||
| 437 | |||
| 438 | if (cfs_rq) | ||
| 439 | tg = cfs_rq->tg; | ||
| 440 | |||
| 441 | if (tg && tg->shares < NICE_0_LOAD) { | ||
| 442 | /* | ||
| 443 | * scale shares to what it would have been had | ||
| 444 | * tg->weight been NICE_0_LOAD: | ||
| 445 | * | ||
| 446 | * weight = 1024 * shares / tg->weight | ||
| 447 | */ | ||
| 448 | lw.weight *= se->load.weight; | ||
| 449 | lw.weight /= tg->shares; | ||
| 450 | |||
| 451 | lw.inv_weight = 0; | ||
| 452 | |||
| 453 | se_lw = &lw; | ||
| 454 | rw += lw.weight - se->load.weight; | ||
| 455 | } else | ||
| 456 | #endif | ||
| 396 | 457 | ||
| 397 | weight = cfs_rq->load.weight; | 458 | if (se->load.weight < NICE_0_LOAD) { |
| 398 | if (!se->on_rq) | 459 | se_lw = &lw; |
| 399 | weight += se->load.weight; | 460 | rw += NICE_0_LOAD - se->load.weight; |
| 461 | } | ||
| 400 | 462 | ||
| 401 | vslice *= NICE_0_LOAD; | 463 | delta = calc_delta_mine(delta, rw, se_lw); |
| 402 | do_div(vslice, weight); | ||
| 403 | } | 464 | } |
| 404 | 465 | ||
| 405 | return vslice; | 466 | return delta; |
| 406 | } | 467 | } |
| 407 | 468 | ||
| 408 | /* | 469 | /* |
| @@ -419,11 +480,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
| 419 | 480 | ||
| 420 | curr->sum_exec_runtime += delta_exec; | 481 | curr->sum_exec_runtime += delta_exec; |
| 421 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 482 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
| 422 | delta_exec_weighted = delta_exec; | 483 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); |
| 423 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { | ||
| 424 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, | ||
| 425 | &curr->load); | ||
| 426 | } | ||
| 427 | curr->vruntime += delta_exec_weighted; | 484 | curr->vruntime += delta_exec_weighted; |
| 428 | } | 485 | } |
| 429 | 486 | ||
| @@ -510,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 510 | * Scheduling class queueing methods: | 567 | * Scheduling class queueing methods: |
| 511 | */ | 568 | */ |
| 512 | 569 | ||
| 570 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
| 571 | static void | ||
| 572 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
| 573 | { | ||
| 574 | cfs_rq->task_weight += weight; | ||
| 575 | } | ||
| 576 | #else | ||
| 577 | static inline void | ||
| 578 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
| 579 | { | ||
| 580 | } | ||
| 581 | #endif | ||
| 582 | |||
| 513 | static void | 583 | static void |
| 514 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 584 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 515 | { | 585 | { |
| 516 | update_load_add(&cfs_rq->load, se->load.weight); | 586 | update_load_add(&cfs_rq->load, se->load.weight); |
| 587 | if (!parent_entity(se)) | ||
| 588 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
| 589 | if (entity_is_task(se)) | ||
| 590 | add_cfs_task_weight(cfs_rq, se->load.weight); | ||
| 517 | cfs_rq->nr_running++; | 591 | cfs_rq->nr_running++; |
| 518 | se->on_rq = 1; | 592 | se->on_rq = 1; |
| 519 | list_add(&se->group_node, &cfs_rq->tasks); | 593 | list_add(&se->group_node, &cfs_rq->tasks); |
| @@ -523,6 +597,10 @@ static void | |||
| 523 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 597 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 524 | { | 598 | { |
| 525 | update_load_sub(&cfs_rq->load, se->load.weight); | 599 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 600 | if (!parent_entity(se)) | ||
| 601 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
| 602 | if (entity_is_task(se)) | ||
| 603 | add_cfs_task_weight(cfs_rq, -se->load.weight); | ||
| 526 | cfs_rq->nr_running--; | 604 | cfs_rq->nr_running--; |
| 527 | se->on_rq = 0; | 605 | se->on_rq = 0; |
| 528 | list_del_init(&se->group_node); | 606 | list_del_init(&se->group_node); |
| @@ -609,8 +687,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 609 | 687 | ||
| 610 | if (!initial) { | 688 | if (!initial) { |
| 611 | /* sleeps up to a single latency don't count. */ | 689 | /* sleeps up to a single latency don't count. */ |
| 612 | if (sched_feat(NEW_FAIR_SLEEPERS)) | 690 | if (sched_feat(NEW_FAIR_SLEEPERS)) { |
| 613 | vruntime -= sysctl_sched_latency; | 691 | unsigned long thresh = sysctl_sched_latency; |
| 692 | |||
| 693 | /* | ||
| 694 | * convert the sleeper threshold into virtual time | ||
| 695 | */ | ||
| 696 | if (sched_feat(NORMALIZED_SLEEPER)) | ||
| 697 | thresh = calc_delta_fair(thresh, se); | ||
| 698 | |||
| 699 | vruntime -= thresh; | ||
| 700 | } | ||
| 614 | 701 | ||
| 615 | /* ensure we never gain time by being placed backwards. */ | 702 | /* ensure we never gain time by being placed backwards. */ |
| 616 | vruntime = max_vruntime(se->vruntime, vruntime); | 703 | vruntime = max_vruntime(se->vruntime, vruntime); |
| @@ -639,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | |||
| 639 | __enqueue_entity(cfs_rq, se); | 726 | __enqueue_entity(cfs_rq, se); |
| 640 | } | 727 | } |
| 641 | 728 | ||
| 642 | static void update_avg(u64 *avg, u64 sample) | ||
| 643 | { | ||
| 644 | s64 diff = sample - *avg; | ||
| 645 | *avg += diff >> 3; | ||
| 646 | } | ||
| 647 | |||
| 648 | static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 649 | { | ||
| 650 | if (!se->last_wakeup) | ||
| 651 | return; | ||
| 652 | |||
| 653 | update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); | ||
| 654 | se->last_wakeup = 0; | ||
| 655 | } | ||
| 656 | |||
| 657 | static void | 729 | static void |
| 658 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | 730 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) |
| 659 | { | 731 | { |
| @@ -664,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
| 664 | 736 | ||
| 665 | update_stats_dequeue(cfs_rq, se); | 737 | update_stats_dequeue(cfs_rq, se); |
| 666 | if (sleep) { | 738 | if (sleep) { |
| 667 | update_avg_stats(cfs_rq, se); | ||
| 668 | #ifdef CONFIG_SCHEDSTATS | 739 | #ifdef CONFIG_SCHEDSTATS |
| 669 | if (entity_is_task(se)) { | 740 | if (entity_is_task(se)) { |
| 670 | struct task_struct *tsk = task_of(se); | 741 | struct task_struct *tsk = task_of(se); |
| @@ -726,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 726 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 797 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
| 727 | } | 798 | } |
| 728 | 799 | ||
| 729 | static int | ||
| 730 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | ||
| 731 | |||
| 732 | static struct sched_entity * | 800 | static struct sched_entity * |
| 733 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) | 801 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 734 | { | 802 | { |
| 735 | if (!cfs_rq->next) | 803 | struct rq *rq = rq_of(cfs_rq); |
| 736 | return se; | 804 | u64 pair_slice = rq->clock - cfs_rq->pair_start; |
| 737 | 805 | ||
| 738 | if (wakeup_preempt_entity(cfs_rq->next, se) != 0) | 806 | if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { |
| 807 | cfs_rq->pair_start = rq->clock; | ||
| 739 | return se; | 808 | return se; |
| 809 | } | ||
| 740 | 810 | ||
| 741 | return cfs_rq->next; | 811 | return cfs_rq->next; |
| 742 | } | 812 | } |
| @@ -835,7 +905,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
| 835 | hrtick_start(rq, delta, requeue); | 905 | hrtick_start(rq, delta, requeue); |
| 836 | } | 906 | } |
| 837 | } | 907 | } |
| 838 | #else | 908 | #else /* !CONFIG_SCHED_HRTICK */ |
| 839 | static inline void | 909 | static inline void |
| 840 | hrtick_start_fair(struct rq *rq, struct task_struct *p) | 910 | hrtick_start_fair(struct rq *rq, struct task_struct *p) |
| 841 | { | 911 | { |
| @@ -976,7 +1046,7 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
| 976 | } | 1046 | } |
| 977 | return cpu; | 1047 | return cpu; |
| 978 | } | 1048 | } |
| 979 | #else | 1049 | #else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ |
| 980 | static inline int wake_idle(int cpu, struct task_struct *p) | 1050 | static inline int wake_idle(int cpu, struct task_struct *p) |
| 981 | { | 1051 | { |
| 982 | return cpu; | 1052 | return cpu; |
| @@ -987,6 +1057,89 @@ static inline int wake_idle(int cpu, struct task_struct *p) | |||
| 987 | 1057 | ||
| 988 | static const struct sched_class fair_sched_class; | 1058 | static const struct sched_class fair_sched_class; |
| 989 | 1059 | ||
| 1060 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1061 | /* | ||
| 1062 | * effective_load() calculates the load change as seen from the root_task_group | ||
| 1063 | * | ||
| 1064 | * Adding load to a group doesn't make a group heavier, but can cause movement | ||
| 1065 | * of group shares between cpus. Assuming the shares were perfectly aligned one | ||
| 1066 | * can calculate the shift in shares. | ||
| 1067 | * | ||
| 1068 | * The problem is that perfectly aligning the shares is rather expensive, hence | ||
| 1069 | * we try to avoid doing that too often - see update_shares(), which ratelimits | ||
| 1070 | * this change. | ||
| 1071 | * | ||
| 1072 | * We compensate this by not only taking the current delta into account, but | ||
| 1073 | * also considering the delta between when the shares were last adjusted and | ||
| 1074 | * now. | ||
| 1075 | * | ||
| 1076 | * We still saw a performance dip, some tracing learned us that between | ||
| 1077 | * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased | ||
| 1078 | * significantly. Therefore try to bias the error in direction of failing | ||
| 1079 | * the affine wakeup. | ||
| 1080 | * | ||
| 1081 | */ | ||
| 1082 | static long effective_load(struct task_group *tg, int cpu, | ||
| 1083 | long wl, long wg) | ||
| 1084 | { | ||
| 1085 | struct sched_entity *se = tg->se[cpu]; | ||
| 1086 | long more_w; | ||
| 1087 | |||
| 1088 | if (!tg->parent) | ||
| 1089 | return wl; | ||
| 1090 | |||
| 1091 | /* | ||
| 1092 | * By not taking the decrease of shares on the other cpu into | ||
| 1093 | * account our error leans towards reducing the affine wakeups. | ||
| 1094 | */ | ||
| 1095 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | ||
| 1096 | return wl; | ||
| 1097 | |||
| 1098 | /* | ||
| 1099 | * Instead of using this increment, also add the difference | ||
| 1100 | * between when the shares were last updated and now. | ||
| 1101 | */ | ||
| 1102 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
| 1103 | wl += more_w; | ||
| 1104 | wg += more_w; | ||
| 1105 | |||
| 1106 | for_each_sched_entity(se) { | ||
| 1107 | #define D(n) (likely(n) ? (n) : 1) | ||
| 1108 | |||
| 1109 | long S, rw, s, a, b; | ||
| 1110 | |||
| 1111 | S = se->my_q->tg->shares; | ||
| 1112 | s = se->my_q->shares; | ||
| 1113 | rw = se->my_q->rq_weight; | ||
| 1114 | |||
| 1115 | a = S*(rw + wl); | ||
| 1116 | b = S*rw + s*wg; | ||
| 1117 | |||
| 1118 | wl = s*(a-b)/D(b); | ||
| 1119 | /* | ||
| 1120 | * Assume the group is already running and will | ||
| 1121 | * thus already be accounted for in the weight. | ||
| 1122 | * | ||
| 1123 | * That is, moving shares between CPUs, does not | ||
| 1124 | * alter the group weight. | ||
| 1125 | */ | ||
| 1126 | wg = 0; | ||
| 1127 | #undef D | ||
| 1128 | } | ||
| 1129 | |||
| 1130 | return wl; | ||
| 1131 | } | ||
| 1132 | |||
| 1133 | #else | ||
| 1134 | |||
| 1135 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | ||
| 1136 | unsigned long wl, unsigned long wg) | ||
| 1137 | { | ||
| 1138 | return wl; | ||
| 1139 | } | ||
| 1140 | |||
| 1141 | #endif | ||
| 1142 | |||
| 990 | static int | 1143 | static int |
| 991 | wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | 1144 | wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, |
| 992 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, | 1145 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, |
| @@ -994,8 +1147,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
| 994 | unsigned int imbalance) | 1147 | unsigned int imbalance) |
| 995 | { | 1148 | { |
| 996 | struct task_struct *curr = this_rq->curr; | 1149 | struct task_struct *curr = this_rq->curr; |
| 1150 | struct task_group *tg; | ||
| 997 | unsigned long tl = this_load; | 1151 | unsigned long tl = this_load; |
| 998 | unsigned long tl_per_task; | 1152 | unsigned long tl_per_task; |
| 1153 | unsigned long weight; | ||
| 999 | int balanced; | 1154 | int balanced; |
| 1000 | 1155 | ||
| 1001 | if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) | 1156 | if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) |
| @@ -1006,19 +1161,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
| 1006 | * effect of the currently running task from the load | 1161 | * effect of the currently running task from the load |
| 1007 | * of the current CPU: | 1162 | * of the current CPU: |
| 1008 | */ | 1163 | */ |
| 1009 | if (sync) | 1164 | if (sync) { |
| 1010 | tl -= current->se.load.weight; | 1165 | tg = task_group(current); |
| 1166 | weight = current->se.load.weight; | ||
| 1167 | |||
| 1168 | tl += effective_load(tg, this_cpu, -weight, -weight); | ||
| 1169 | load += effective_load(tg, prev_cpu, 0, -weight); | ||
| 1170 | } | ||
| 1011 | 1171 | ||
| 1012 | balanced = 100*(tl + p->se.load.weight) <= imbalance*load; | 1172 | tg = task_group(p); |
| 1173 | weight = p->se.load.weight; | ||
| 1174 | |||
| 1175 | balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= | ||
| 1176 | imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); | ||
| 1013 | 1177 | ||
| 1014 | /* | 1178 | /* |
| 1015 | * If the currently running task will sleep within | 1179 | * If the currently running task will sleep within |
| 1016 | * a reasonable amount of time then attract this newly | 1180 | * a reasonable amount of time then attract this newly |
| 1017 | * woken task: | 1181 | * woken task: |
| 1018 | */ | 1182 | */ |
| 1019 | if (sync && balanced && curr->sched_class == &fair_sched_class) { | 1183 | if (sync && balanced) { |
| 1020 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && | 1184 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && |
| 1021 | p->se.avg_overlap < sysctl_sched_migration_cost) | 1185 | p->se.avg_overlap < sysctl_sched_migration_cost) |
| 1022 | return 1; | 1186 | return 1; |
| 1023 | } | 1187 | } |
| 1024 | 1188 | ||
| @@ -1111,11 +1275,13 @@ static unsigned long wakeup_gran(struct sched_entity *se) | |||
| 1111 | unsigned long gran = sysctl_sched_wakeup_granularity; | 1275 | unsigned long gran = sysctl_sched_wakeup_granularity; |
| 1112 | 1276 | ||
| 1113 | /* | 1277 | /* |
| 1114 | * More easily preempt - nice tasks, while not making | 1278 | * More easily preempt - nice tasks, while not making it harder for |
| 1115 | * it harder for + nice tasks. | 1279 | * + nice tasks. |
| 1116 | */ | 1280 | */ |
| 1117 | if (unlikely(se->load.weight > NICE_0_LOAD)) | 1281 | if (sched_feat(ASYM_GRAN)) |
| 1118 | gran = calc_delta_fair(gran, &se->load); | 1282 | gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); |
| 1283 | else | ||
| 1284 | gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); | ||
| 1119 | 1285 | ||
| 1120 | return gran; | 1286 | return gran; |
| 1121 | } | 1287 | } |
| @@ -1177,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1177 | return; | 1343 | return; |
| 1178 | } | 1344 | } |
| 1179 | 1345 | ||
| 1180 | se->last_wakeup = se->sum_exec_runtime; | ||
| 1181 | if (unlikely(se == pse)) | 1346 | if (unlikely(se == pse)) |
| 1182 | return; | 1347 | return; |
| 1183 | 1348 | ||
| @@ -1275,23 +1440,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) | |||
| 1275 | struct task_struct *p = NULL; | 1440 | struct task_struct *p = NULL; |
| 1276 | struct sched_entity *se; | 1441 | struct sched_entity *se; |
| 1277 | 1442 | ||
| 1278 | if (next == &cfs_rq->tasks) | 1443 | while (next != &cfs_rq->tasks) { |
| 1279 | return NULL; | ||
| 1280 | |||
| 1281 | /* Skip over entities that are not tasks */ | ||
| 1282 | do { | ||
| 1283 | se = list_entry(next, struct sched_entity, group_node); | 1444 | se = list_entry(next, struct sched_entity, group_node); |
| 1284 | next = next->next; | 1445 | next = next->next; |
| 1285 | } while (next != &cfs_rq->tasks && !entity_is_task(se)); | ||
| 1286 | 1446 | ||
| 1287 | if (next == &cfs_rq->tasks) | 1447 | /* Skip over entities that are not tasks */ |
| 1288 | return NULL; | 1448 | if (entity_is_task(se)) { |
| 1449 | p = task_of(se); | ||
| 1450 | break; | ||
| 1451 | } | ||
| 1452 | } | ||
| 1289 | 1453 | ||
| 1290 | cfs_rq->balance_iterator = next; | 1454 | cfs_rq->balance_iterator = next; |
| 1291 | |||
| 1292 | if (entity_is_task(se)) | ||
| 1293 | p = task_of(se); | ||
| 1294 | |||
| 1295 | return p; | 1455 | return p; |
| 1296 | } | 1456 | } |
| 1297 | 1457 | ||
| @@ -1309,75 +1469,82 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
| 1309 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); | 1469 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); |
| 1310 | } | 1470 | } |
| 1311 | 1471 | ||
| 1312 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1472 | static unsigned long |
| 1313 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | 1473 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1474 | unsigned long max_load_move, struct sched_domain *sd, | ||
| 1475 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | ||
| 1476 | struct cfs_rq *cfs_rq) | ||
| 1314 | { | 1477 | { |
| 1315 | struct sched_entity *curr; | 1478 | struct rq_iterator cfs_rq_iterator; |
| 1316 | struct task_struct *p; | ||
| 1317 | |||
| 1318 | if (!cfs_rq->nr_running || !first_fair(cfs_rq)) | ||
| 1319 | return MAX_PRIO; | ||
| 1320 | |||
| 1321 | curr = cfs_rq->curr; | ||
| 1322 | if (!curr) | ||
| 1323 | curr = __pick_next_entity(cfs_rq); | ||
| 1324 | 1479 | ||
| 1325 | p = task_of(curr); | 1480 | cfs_rq_iterator.start = load_balance_start_fair; |
| 1481 | cfs_rq_iterator.next = load_balance_next_fair; | ||
| 1482 | cfs_rq_iterator.arg = cfs_rq; | ||
| 1326 | 1483 | ||
| 1327 | return p->prio; | 1484 | return balance_tasks(this_rq, this_cpu, busiest, |
| 1485 | max_load_move, sd, idle, all_pinned, | ||
| 1486 | this_best_prio, &cfs_rq_iterator); | ||
| 1328 | } | 1487 | } |
| 1329 | #endif | ||
| 1330 | 1488 | ||
| 1489 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1331 | static unsigned long | 1490 | static unsigned long |
| 1332 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1491 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1333 | unsigned long max_load_move, | 1492 | unsigned long max_load_move, |
| 1334 | struct sched_domain *sd, enum cpu_idle_type idle, | 1493 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 1335 | int *all_pinned, int *this_best_prio) | 1494 | int *all_pinned, int *this_best_prio) |
| 1336 | { | 1495 | { |
| 1337 | struct cfs_rq *busy_cfs_rq; | ||
| 1338 | long rem_load_move = max_load_move; | 1496 | long rem_load_move = max_load_move; |
| 1339 | struct rq_iterator cfs_rq_iterator; | 1497 | int busiest_cpu = cpu_of(busiest); |
| 1340 | 1498 | struct task_group *tg; | |
| 1341 | cfs_rq_iterator.start = load_balance_start_fair; | ||
| 1342 | cfs_rq_iterator.next = load_balance_next_fair; | ||
| 1343 | 1499 | ||
| 1344 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1500 | rcu_read_lock(); |
| 1345 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1501 | update_h_load(busiest_cpu); |
| 1346 | struct cfs_rq *this_cfs_rq; | ||
| 1347 | long imbalance; | ||
| 1348 | unsigned long maxload; | ||
| 1349 | 1502 | ||
| 1350 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1503 | list_for_each_entry(tg, &task_groups, list) { |
| 1504 | struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; | ||
| 1505 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; | ||
| 1506 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | ||
| 1507 | u64 rem_load, moved_load; | ||
| 1351 | 1508 | ||
| 1352 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1509 | /* |
| 1353 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | 1510 | * empty group |
| 1354 | if (imbalance <= 0) | 1511 | */ |
| 1512 | if (!busiest_cfs_rq->task_weight) | ||
| 1355 | continue; | 1513 | continue; |
| 1356 | 1514 | ||
| 1357 | /* Don't pull more than imbalance/2 */ | 1515 | rem_load = (u64)rem_load_move * busiest_weight; |
| 1358 | imbalance /= 2; | 1516 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
| 1359 | maxload = min(rem_load_move, imbalance); | ||
| 1360 | 1517 | ||
| 1361 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1518 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, |
| 1362 | #else | 1519 | rem_load, sd, idle, all_pinned, this_best_prio, |
| 1363 | # define maxload rem_load_move | 1520 | tg->cfs_rq[busiest_cpu]); |
| 1364 | #endif | 1521 | |
| 1365 | /* | 1522 | if (!moved_load) |
| 1366 | * pass busy_cfs_rq argument into | 1523 | continue; |
| 1367 | * load_balance_[start|next]_fair iterators | 1524 | |
| 1368 | */ | 1525 | moved_load *= busiest_h_load; |
| 1369 | cfs_rq_iterator.arg = busy_cfs_rq; | 1526 | moved_load = div_u64(moved_load, busiest_weight + 1); |
| 1370 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | ||
| 1371 | maxload, sd, idle, all_pinned, | ||
| 1372 | this_best_prio, | ||
| 1373 | &cfs_rq_iterator); | ||
| 1374 | 1527 | ||
| 1375 | if (rem_load_move <= 0) | 1528 | rem_load_move -= moved_load; |
| 1529 | if (rem_load_move < 0) | ||
| 1376 | break; | 1530 | break; |
| 1377 | } | 1531 | } |
| 1532 | rcu_read_unlock(); | ||
| 1378 | 1533 | ||
| 1379 | return max_load_move - rem_load_move; | 1534 | return max_load_move - rem_load_move; |
| 1380 | } | 1535 | } |
| 1536 | #else | ||
| 1537 | static unsigned long | ||
| 1538 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 1539 | unsigned long max_load_move, | ||
| 1540 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 1541 | int *all_pinned, int *this_best_prio) | ||
| 1542 | { | ||
| 1543 | return __load_balance_fair(this_rq, this_cpu, busiest, | ||
| 1544 | max_load_move, sd, idle, all_pinned, | ||
| 1545 | this_best_prio, &busiest->cfs); | ||
| 1546 | } | ||
| 1547 | #endif | ||
| 1381 | 1548 | ||
| 1382 | static int | 1549 | static int |
| 1383 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1550 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| @@ -1402,7 +1569,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 1402 | 1569 | ||
| 1403 | return 0; | 1570 | return 0; |
| 1404 | } | 1571 | } |
| 1405 | #endif | 1572 | #endif /* CONFIG_SMP */ |
| 1406 | 1573 | ||
| 1407 | /* | 1574 | /* |
| 1408 | * scheduler tick hitting a task of our scheduling class: | 1575 | * scheduler tick hitting a task of our scheduling class: |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 1c7283cb9581..862b06bd560a 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) | 1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) |
| 2 | SCHED_FEAT(NORMALIZED_SLEEPER, 1) | ||
| 2 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | 3 | SCHED_FEAT(WAKEUP_PREEMPT, 1) |
| 3 | SCHED_FEAT(START_DEBIT, 1) | 4 | SCHED_FEAT(START_DEBIT, 1) |
| 4 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | 5 | SCHED_FEAT(AFFINE_WAKEUPS, 1) |
| @@ -6,5 +7,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1) | |||
| 6 | SCHED_FEAT(SYNC_WAKEUPS, 1) | 7 | SCHED_FEAT(SYNC_WAKEUPS, 1) |
| 7 | SCHED_FEAT(HRTICK, 1) | 8 | SCHED_FEAT(HRTICK, 1) |
| 8 | SCHED_FEAT(DOUBLE_TICK, 0) | 9 | SCHED_FEAT(DOUBLE_TICK, 0) |
| 9 | SCHED_FEAT(NORMALIZED_SLEEPER, 1) | 10 | SCHED_FEAT(ASYM_GRAN, 1) |
| 10 | SCHED_FEAT(DEADLINE, 1) | 11 | SCHED_FEAT(LB_BIAS, 0) |
| 12 | SCHED_FEAT(LB_WAKEUP_UPDATE, 1) | ||
| 13 | SCHED_FEAT(ASYM_EFF_LOAD, 1) | ||
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0f3c19197fa4..47ceac9e8552 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq) | |||
| 12 | 12 | ||
| 13 | static inline void rt_set_overload(struct rq *rq) | 13 | static inline void rt_set_overload(struct rq *rq) |
| 14 | { | 14 | { |
| 15 | if (!rq->online) | ||
| 16 | return; | ||
| 17 | |||
| 15 | cpu_set(rq->cpu, rq->rd->rto_mask); | 18 | cpu_set(rq->cpu, rq->rd->rto_mask); |
| 16 | /* | 19 | /* |
| 17 | * Make sure the mask is visible before we set | 20 | * Make sure the mask is visible before we set |
| @@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq) | |||
| 26 | 29 | ||
| 27 | static inline void rt_clear_overload(struct rq *rq) | 30 | static inline void rt_clear_overload(struct rq *rq) |
| 28 | { | 31 | { |
| 32 | if (!rq->online) | ||
| 33 | return; | ||
| 34 | |||
| 29 | /* the order here really doesn't matter */ | 35 | /* the order here really doesn't matter */ |
| 30 | atomic_dec(&rq->rd->rto_count); | 36 | atomic_dec(&rq->rd->rto_count); |
| 31 | cpu_clear(rq->cpu, rq->rd->rto_mask); | 37 | cpu_clear(rq->cpu, rq->rd->rto_mask); |
| @@ -155,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | |||
| 155 | return &rt_rq->tg->rt_bandwidth; | 161 | return &rt_rq->tg->rt_bandwidth; |
| 156 | } | 162 | } |
| 157 | 163 | ||
| 158 | #else | 164 | #else /* !CONFIG_RT_GROUP_SCHED */ |
| 159 | 165 | ||
| 160 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | 166 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) |
| 161 | { | 167 | { |
| @@ -220,49 +226,10 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | |||
| 220 | return &def_rt_bandwidth; | 226 | return &def_rt_bandwidth; |
| 221 | } | 227 | } |
| 222 | 228 | ||
| 223 | #endif | 229 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 224 | |||
| 225 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | ||
| 226 | { | ||
| 227 | int i, idle = 1; | ||
| 228 | cpumask_t span; | ||
| 229 | |||
| 230 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
| 231 | return 1; | ||
| 232 | |||
| 233 | span = sched_rt_period_mask(); | ||
| 234 | for_each_cpu_mask(i, span) { | ||
| 235 | int enqueue = 0; | ||
| 236 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | ||
| 237 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 238 | |||
| 239 | spin_lock(&rq->lock); | ||
| 240 | if (rt_rq->rt_time) { | ||
| 241 | u64 runtime; | ||
| 242 | |||
| 243 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 244 | runtime = rt_rq->rt_runtime; | ||
| 245 | rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); | ||
| 246 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
| 247 | rt_rq->rt_throttled = 0; | ||
| 248 | enqueue = 1; | ||
| 249 | } | ||
| 250 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | ||
| 251 | idle = 0; | ||
| 252 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 253 | } else if (rt_rq->rt_nr_running) | ||
| 254 | idle = 0; | ||
| 255 | |||
| 256 | if (enqueue) | ||
| 257 | sched_rt_rq_enqueue(rt_rq); | ||
| 258 | spin_unlock(&rq->lock); | ||
| 259 | } | ||
| 260 | |||
| 261 | return idle; | ||
| 262 | } | ||
| 263 | 230 | ||
| 264 | #ifdef CONFIG_SMP | 231 | #ifdef CONFIG_SMP |
| 265 | static int balance_runtime(struct rt_rq *rt_rq) | 232 | static int do_balance_runtime(struct rt_rq *rt_rq) |
| 266 | { | 233 | { |
| 267 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 234 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
| 268 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; | 235 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; |
| @@ -281,6 +248,9 @@ static int balance_runtime(struct rt_rq *rt_rq) | |||
| 281 | continue; | 248 | continue; |
| 282 | 249 | ||
| 283 | spin_lock(&iter->rt_runtime_lock); | 250 | spin_lock(&iter->rt_runtime_lock); |
| 251 | if (iter->rt_runtime == RUNTIME_INF) | ||
| 252 | goto next; | ||
| 253 | |||
| 284 | diff = iter->rt_runtime - iter->rt_time; | 254 | diff = iter->rt_runtime - iter->rt_time; |
| 285 | if (diff > 0) { | 255 | if (diff > 0) { |
| 286 | do_div(diff, weight); | 256 | do_div(diff, weight); |
| @@ -294,13 +264,163 @@ static int balance_runtime(struct rt_rq *rt_rq) | |||
| 294 | break; | 264 | break; |
| 295 | } | 265 | } |
| 296 | } | 266 | } |
| 267 | next: | ||
| 297 | spin_unlock(&iter->rt_runtime_lock); | 268 | spin_unlock(&iter->rt_runtime_lock); |
| 298 | } | 269 | } |
| 299 | spin_unlock(&rt_b->rt_runtime_lock); | 270 | spin_unlock(&rt_b->rt_runtime_lock); |
| 300 | 271 | ||
| 301 | return more; | 272 | return more; |
| 302 | } | 273 | } |
| 303 | #endif | 274 | |
| 275 | static void __disable_runtime(struct rq *rq) | ||
| 276 | { | ||
| 277 | struct root_domain *rd = rq->rd; | ||
| 278 | struct rt_rq *rt_rq; | ||
| 279 | |||
| 280 | if (unlikely(!scheduler_running)) | ||
| 281 | return; | ||
| 282 | |||
| 283 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
| 284 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
| 285 | s64 want; | ||
| 286 | int i; | ||
| 287 | |||
| 288 | spin_lock(&rt_b->rt_runtime_lock); | ||
| 289 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 290 | if (rt_rq->rt_runtime == RUNTIME_INF || | ||
| 291 | rt_rq->rt_runtime == rt_b->rt_runtime) | ||
| 292 | goto balanced; | ||
| 293 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 294 | |||
| 295 | want = rt_b->rt_runtime - rt_rq->rt_runtime; | ||
| 296 | |||
| 297 | for_each_cpu_mask(i, rd->span) { | ||
| 298 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); | ||
| 299 | s64 diff; | ||
| 300 | |||
| 301 | if (iter == rt_rq) | ||
| 302 | continue; | ||
| 303 | |||
| 304 | spin_lock(&iter->rt_runtime_lock); | ||
| 305 | if (want > 0) { | ||
| 306 | diff = min_t(s64, iter->rt_runtime, want); | ||
| 307 | iter->rt_runtime -= diff; | ||
| 308 | want -= diff; | ||
| 309 | } else { | ||
| 310 | iter->rt_runtime -= want; | ||
| 311 | want -= want; | ||
| 312 | } | ||
| 313 | spin_unlock(&iter->rt_runtime_lock); | ||
| 314 | |||
| 315 | if (!want) | ||
| 316 | break; | ||
| 317 | } | ||
| 318 | |||
| 319 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 320 | BUG_ON(want); | ||
| 321 | balanced: | ||
| 322 | rt_rq->rt_runtime = RUNTIME_INF; | ||
| 323 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 324 | spin_unlock(&rt_b->rt_runtime_lock); | ||
| 325 | } | ||
| 326 | } | ||
| 327 | |||
| 328 | static void disable_runtime(struct rq *rq) | ||
| 329 | { | ||
| 330 | unsigned long flags; | ||
| 331 | |||
| 332 | spin_lock_irqsave(&rq->lock, flags); | ||
| 333 | __disable_runtime(rq); | ||
| 334 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 335 | } | ||
| 336 | |||
| 337 | static void __enable_runtime(struct rq *rq) | ||
| 338 | { | ||
| 339 | struct rt_rq *rt_rq; | ||
| 340 | |||
| 341 | if (unlikely(!scheduler_running)) | ||
| 342 | return; | ||
| 343 | |||
| 344 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
| 345 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
| 346 | |||
| 347 | spin_lock(&rt_b->rt_runtime_lock); | ||
| 348 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 349 | rt_rq->rt_runtime = rt_b->rt_runtime; | ||
| 350 | rt_rq->rt_time = 0; | ||
| 351 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 352 | spin_unlock(&rt_b->rt_runtime_lock); | ||
| 353 | } | ||
| 354 | } | ||
| 355 | |||
| 356 | static void enable_runtime(struct rq *rq) | ||
| 357 | { | ||
| 358 | unsigned long flags; | ||
| 359 | |||
| 360 | spin_lock_irqsave(&rq->lock, flags); | ||
| 361 | __enable_runtime(rq); | ||
| 362 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 363 | } | ||
| 364 | |||
| 365 | static int balance_runtime(struct rt_rq *rt_rq) | ||
| 366 | { | ||
| 367 | int more = 0; | ||
| 368 | |||
| 369 | if (rt_rq->rt_time > rt_rq->rt_runtime) { | ||
| 370 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 371 | more = do_balance_runtime(rt_rq); | ||
| 372 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 373 | } | ||
| 374 | |||
| 375 | return more; | ||
| 376 | } | ||
| 377 | #else /* !CONFIG_SMP */ | ||
| 378 | static inline int balance_runtime(struct rt_rq *rt_rq) | ||
| 379 | { | ||
| 380 | return 0; | ||
| 381 | } | ||
| 382 | #endif /* CONFIG_SMP */ | ||
| 383 | |||
| 384 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | ||
| 385 | { | ||
| 386 | int i, idle = 1; | ||
| 387 | cpumask_t span; | ||
| 388 | |||
| 389 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
| 390 | return 1; | ||
| 391 | |||
| 392 | span = sched_rt_period_mask(); | ||
| 393 | for_each_cpu_mask(i, span) { | ||
| 394 | int enqueue = 0; | ||
| 395 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | ||
| 396 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 397 | |||
| 398 | spin_lock(&rq->lock); | ||
| 399 | if (rt_rq->rt_time) { | ||
| 400 | u64 runtime; | ||
| 401 | |||
| 402 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 403 | if (rt_rq->rt_throttled) | ||
| 404 | balance_runtime(rt_rq); | ||
| 405 | runtime = rt_rq->rt_runtime; | ||
| 406 | rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); | ||
| 407 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
| 408 | rt_rq->rt_throttled = 0; | ||
| 409 | enqueue = 1; | ||
| 410 | } | ||
| 411 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | ||
| 412 | idle = 0; | ||
| 413 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 414 | } else if (rt_rq->rt_nr_running) | ||
| 415 | idle = 0; | ||
| 416 | |||
| 417 | if (enqueue) | ||
| 418 | sched_rt_rq_enqueue(rt_rq); | ||
| 419 | spin_unlock(&rq->lock); | ||
| 420 | } | ||
| 421 | |||
| 422 | return idle; | ||
| 423 | } | ||
| 304 | 424 | ||
| 305 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | 425 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) |
| 306 | { | 426 | { |
| @@ -327,18 +447,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
| 327 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | 447 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) |
| 328 | return 0; | 448 | return 0; |
| 329 | 449 | ||
| 330 | #ifdef CONFIG_SMP | 450 | balance_runtime(rt_rq); |
| 331 | if (rt_rq->rt_time > runtime) { | 451 | runtime = sched_rt_runtime(rt_rq); |
| 332 | int more; | 452 | if (runtime == RUNTIME_INF) |
| 333 | 453 | return 0; | |
| 334 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 335 | more = balance_runtime(rt_rq); | ||
| 336 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 337 | |||
| 338 | if (more) | ||
| 339 | runtime = sched_rt_runtime(rt_rq); | ||
| 340 | } | ||
| 341 | #endif | ||
| 342 | 454 | ||
| 343 | if (rt_rq->rt_time > runtime) { | 455 | if (rt_rq->rt_time > runtime) { |
| 344 | rt_rq->rt_throttled = 1; | 456 | rt_rq->rt_throttled = 1; |
| @@ -392,12 +504,21 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 392 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | 504 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); |
| 393 | rt_rq->rt_nr_running++; | 505 | rt_rq->rt_nr_running++; |
| 394 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 506 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
| 395 | if (rt_se_prio(rt_se) < rt_rq->highest_prio) | 507 | if (rt_se_prio(rt_se) < rt_rq->highest_prio) { |
| 508 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 509 | |||
| 396 | rt_rq->highest_prio = rt_se_prio(rt_se); | 510 | rt_rq->highest_prio = rt_se_prio(rt_se); |
| 511 | #ifdef CONFIG_SMP | ||
| 512 | if (rq->online) | ||
| 513 | cpupri_set(&rq->rd->cpupri, rq->cpu, | ||
| 514 | rt_se_prio(rt_se)); | ||
| 515 | #endif | ||
| 516 | } | ||
| 397 | #endif | 517 | #endif |
| 398 | #ifdef CONFIG_SMP | 518 | #ifdef CONFIG_SMP |
| 399 | if (rt_se->nr_cpus_allowed > 1) { | 519 | if (rt_se->nr_cpus_allowed > 1) { |
| 400 | struct rq *rq = rq_of_rt_rq(rt_rq); | 520 | struct rq *rq = rq_of_rt_rq(rt_rq); |
| 521 | |||
| 401 | rq->rt.rt_nr_migratory++; | 522 | rq->rt.rt_nr_migratory++; |
| 402 | } | 523 | } |
| 403 | 524 | ||
| @@ -417,6 +538,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 417 | static inline | 538 | static inline |
| 418 | void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 539 | void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
| 419 | { | 540 | { |
| 541 | #ifdef CONFIG_SMP | ||
| 542 | int highest_prio = rt_rq->highest_prio; | ||
| 543 | #endif | ||
| 544 | |||
| 420 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | 545 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); |
| 421 | WARN_ON(!rt_rq->rt_nr_running); | 546 | WARN_ON(!rt_rq->rt_nr_running); |
| 422 | rt_rq->rt_nr_running--; | 547 | rt_rq->rt_nr_running--; |
| @@ -440,6 +565,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 440 | rq->rt.rt_nr_migratory--; | 565 | rq->rt.rt_nr_migratory--; |
| 441 | } | 566 | } |
| 442 | 567 | ||
| 568 | if (rt_rq->highest_prio != highest_prio) { | ||
| 569 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 570 | |||
| 571 | if (rq->online) | ||
| 572 | cpupri_set(&rq->rd->cpupri, rq->cpu, | ||
| 573 | rt_rq->highest_prio); | ||
| 574 | } | ||
| 575 | |||
| 443 | update_rt_migration(rq_of_rt_rq(rt_rq)); | 576 | update_rt_migration(rq_of_rt_rq(rt_rq)); |
| 444 | #endif /* CONFIG_SMP */ | 577 | #endif /* CONFIG_SMP */ |
| 445 | #ifdef CONFIG_RT_GROUP_SCHED | 578 | #ifdef CONFIG_RT_GROUP_SCHED |
| @@ -455,6 +588,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) | |||
| 455 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 588 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
| 456 | struct rt_prio_array *array = &rt_rq->active; | 589 | struct rt_prio_array *array = &rt_rq->active; |
| 457 | struct rt_rq *group_rq = group_rt_rq(rt_se); | 590 | struct rt_rq *group_rq = group_rt_rq(rt_se); |
| 591 | struct list_head *queue = array->queue + rt_se_prio(rt_se); | ||
| 458 | 592 | ||
| 459 | /* | 593 | /* |
| 460 | * Don't enqueue the group if its throttled, or when empty. | 594 | * Don't enqueue the group if its throttled, or when empty. |
| @@ -465,7 +599,11 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) | |||
| 465 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 599 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
| 466 | return; | 600 | return; |
| 467 | 601 | ||
| 468 | list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | 602 | if (rt_se->nr_cpus_allowed == 1) |
| 603 | list_add(&rt_se->run_list, queue); | ||
| 604 | else | ||
| 605 | list_add_tail(&rt_se->run_list, queue); | ||
| 606 | |||
| 469 | __set_bit(rt_se_prio(rt_se), array->bitmap); | 607 | __set_bit(rt_se_prio(rt_se), array->bitmap); |
| 470 | 608 | ||
| 471 | inc_rt_tasks(rt_se, rt_rq); | 609 | inc_rt_tasks(rt_se, rt_rq); |
| @@ -532,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 532 | rt_se->timeout = 0; | 670 | rt_se->timeout = 0; |
| 533 | 671 | ||
| 534 | enqueue_rt_entity(rt_se); | 672 | enqueue_rt_entity(rt_se); |
| 673 | |||
| 674 | inc_cpu_load(rq, p->se.load.weight); | ||
| 535 | } | 675 | } |
| 536 | 676 | ||
| 537 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 677 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
| @@ -540,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | |||
| 540 | 680 | ||
| 541 | update_curr_rt(rq); | 681 | update_curr_rt(rq); |
| 542 | dequeue_rt_entity(rt_se); | 682 | dequeue_rt_entity(rt_se); |
| 683 | |||
| 684 | dec_cpu_load(rq, p->se.load.weight); | ||
| 543 | } | 685 | } |
| 544 | 686 | ||
| 545 | /* | 687 | /* |
| @@ -550,10 +692,12 @@ static | |||
| 550 | void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) | 692 | void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) |
| 551 | { | 693 | { |
| 552 | struct rt_prio_array *array = &rt_rq->active; | 694 | struct rt_prio_array *array = &rt_rq->active; |
| 553 | struct list_head *queue = array->queue + rt_se_prio(rt_se); | ||
| 554 | 695 | ||
| 555 | if (on_rt_rq(rt_se)) | 696 | if (on_rt_rq(rt_se)) { |
| 556 | list_move_tail(&rt_se->run_list, queue); | 697 | list_del_init(&rt_se->run_list); |
| 698 | list_add_tail(&rt_se->run_list, | ||
| 699 | array->queue + rt_se_prio(rt_se)); | ||
| 700 | } | ||
| 557 | } | 701 | } |
| 558 | 702 | ||
| 559 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) | 703 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) |
| @@ -616,8 +760,37 @@ static int select_task_rq_rt(struct task_struct *p, int sync) | |||
| 616 | */ | 760 | */ |
| 617 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | 761 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) |
| 618 | { | 762 | { |
| 619 | if (p->prio < rq->curr->prio) | 763 | if (p->prio < rq->curr->prio) { |
| 620 | resched_task(rq->curr); | 764 | resched_task(rq->curr); |
| 765 | return; | ||
| 766 | } | ||
| 767 | |||
| 768 | #ifdef CONFIG_SMP | ||
| 769 | /* | ||
| 770 | * If: | ||
| 771 | * | ||
| 772 | * - the newly woken task is of equal priority to the current task | ||
| 773 | * - the newly woken task is non-migratable while current is migratable | ||
| 774 | * - current will be preempted on the next reschedule | ||
| 775 | * | ||
| 776 | * we should check to see if current can readily move to a different | ||
| 777 | * cpu. If so, we will reschedule to allow the push logic to try | ||
| 778 | * to move current somewhere else, making room for our non-migratable | ||
| 779 | * task. | ||
| 780 | */ | ||
| 781 | if((p->prio == rq->curr->prio) | ||
| 782 | && p->rt.nr_cpus_allowed == 1 | ||
| 783 | && rq->curr->rt.nr_cpus_allowed != 1) { | ||
| 784 | cpumask_t mask; | ||
| 785 | |||
| 786 | if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) | ||
| 787 | /* | ||
| 788 | * There appears to be other cpus that can accept | ||
| 789 | * current, so lets reschedule to try and push it away | ||
| 790 | */ | ||
| 791 | resched_task(rq->curr); | ||
| 792 | } | ||
| 793 | #endif | ||
| 621 | } | 794 | } |
| 622 | 795 | ||
| 623 | static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, | 796 | static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, |
| @@ -720,73 +893,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | |||
| 720 | 893 | ||
| 721 | static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); | 894 | static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); |
| 722 | 895 | ||
| 723 | static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) | ||
| 724 | { | ||
| 725 | int lowest_prio = -1; | ||
| 726 | int lowest_cpu = -1; | ||
| 727 | int count = 0; | ||
| 728 | int cpu; | ||
| 729 | |||
| 730 | cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); | ||
| 731 | |||
| 732 | /* | ||
| 733 | * Scan each rq for the lowest prio. | ||
| 734 | */ | ||
| 735 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
| 736 | struct rq *rq = cpu_rq(cpu); | ||
| 737 | |||
| 738 | /* We look for lowest RT prio or non-rt CPU */ | ||
| 739 | if (rq->rt.highest_prio >= MAX_RT_PRIO) { | ||
| 740 | /* | ||
| 741 | * if we already found a low RT queue | ||
| 742 | * and now we found this non-rt queue | ||
| 743 | * clear the mask and set our bit. | ||
| 744 | * Otherwise just return the queue as is | ||
| 745 | * and the count==1 will cause the algorithm | ||
| 746 | * to use the first bit found. | ||
| 747 | */ | ||
| 748 | if (lowest_cpu != -1) { | ||
| 749 | cpus_clear(*lowest_mask); | ||
| 750 | cpu_set(rq->cpu, *lowest_mask); | ||
| 751 | } | ||
| 752 | return 1; | ||
| 753 | } | ||
| 754 | |||
| 755 | /* no locking for now */ | ||
| 756 | if ((rq->rt.highest_prio > task->prio) | ||
| 757 | && (rq->rt.highest_prio >= lowest_prio)) { | ||
| 758 | if (rq->rt.highest_prio > lowest_prio) { | ||
| 759 | /* new low - clear old data */ | ||
| 760 | lowest_prio = rq->rt.highest_prio; | ||
| 761 | lowest_cpu = cpu; | ||
| 762 | count = 0; | ||
| 763 | } | ||
| 764 | count++; | ||
| 765 | } else | ||
| 766 | cpu_clear(cpu, *lowest_mask); | ||
| 767 | } | ||
| 768 | |||
| 769 | /* | ||
| 770 | * Clear out all the set bits that represent | ||
| 771 | * runqueues that were of higher prio than | ||
| 772 | * the lowest_prio. | ||
| 773 | */ | ||
| 774 | if (lowest_cpu > 0) { | ||
| 775 | /* | ||
| 776 | * Perhaps we could add another cpumask op to | ||
| 777 | * zero out bits. Like cpu_zero_bits(cpumask, nrbits); | ||
| 778 | * Then that could be optimized to use memset and such. | ||
| 779 | */ | ||
| 780 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
| 781 | if (cpu >= lowest_cpu) | ||
| 782 | break; | ||
| 783 | cpu_clear(cpu, *lowest_mask); | ||
| 784 | } | ||
| 785 | } | ||
| 786 | |||
| 787 | return count; | ||
| 788 | } | ||
| 789 | |||
| 790 | static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) | 896 | static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) |
| 791 | { | 897 | { |
| 792 | int first; | 898 | int first; |
| @@ -808,17 +914,12 @@ static int find_lowest_rq(struct task_struct *task) | |||
| 808 | cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); | 914 | cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); |
| 809 | int this_cpu = smp_processor_id(); | 915 | int this_cpu = smp_processor_id(); |
| 810 | int cpu = task_cpu(task); | 916 | int cpu = task_cpu(task); |
| 811 | int count = find_lowest_cpus(task, lowest_mask); | ||
| 812 | 917 | ||
| 813 | if (!count) | 918 | if (task->rt.nr_cpus_allowed == 1) |
| 814 | return -1; /* No targets found */ | 919 | return -1; /* No other targets possible */ |
| 815 | 920 | ||
| 816 | /* | 921 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) |
| 817 | * There is no sense in performing an optimal search if only one | 922 | return -1; /* No targets found */ |
| 818 | * target is found. | ||
| 819 | */ | ||
| 820 | if (count == 1) | ||
| 821 | return first_cpu(*lowest_mask); | ||
| 822 | 923 | ||
| 823 | /* | 924 | /* |
| 824 | * At this point we have built a mask of cpus representing the | 925 | * At this point we have built a mask of cpus representing the |
| @@ -1163,17 +1264,25 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
| 1163 | } | 1264 | } |
| 1164 | 1265 | ||
| 1165 | /* Assumes rq->lock is held */ | 1266 | /* Assumes rq->lock is held */ |
| 1166 | static void join_domain_rt(struct rq *rq) | 1267 | static void rq_online_rt(struct rq *rq) |
| 1167 | { | 1268 | { |
| 1168 | if (rq->rt.overloaded) | 1269 | if (rq->rt.overloaded) |
| 1169 | rt_set_overload(rq); | 1270 | rt_set_overload(rq); |
| 1271 | |||
| 1272 | __enable_runtime(rq); | ||
| 1273 | |||
| 1274 | cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); | ||
| 1170 | } | 1275 | } |
| 1171 | 1276 | ||
| 1172 | /* Assumes rq->lock is held */ | 1277 | /* Assumes rq->lock is held */ |
| 1173 | static void leave_domain_rt(struct rq *rq) | 1278 | static void rq_offline_rt(struct rq *rq) |
| 1174 | { | 1279 | { |
| 1175 | if (rq->rt.overloaded) | 1280 | if (rq->rt.overloaded) |
| 1176 | rt_clear_overload(rq); | 1281 | rt_clear_overload(rq); |
| 1282 | |||
| 1283 | __disable_runtime(rq); | ||
| 1284 | |||
| 1285 | cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); | ||
| 1177 | } | 1286 | } |
| 1178 | 1287 | ||
| 1179 | /* | 1288 | /* |
| @@ -1336,8 +1445,8 @@ static const struct sched_class rt_sched_class = { | |||
| 1336 | .load_balance = load_balance_rt, | 1445 | .load_balance = load_balance_rt, |
| 1337 | .move_one_task = move_one_task_rt, | 1446 | .move_one_task = move_one_task_rt, |
| 1338 | .set_cpus_allowed = set_cpus_allowed_rt, | 1447 | .set_cpus_allowed = set_cpus_allowed_rt, |
| 1339 | .join_domain = join_domain_rt, | 1448 | .rq_online = rq_online_rt, |
| 1340 | .leave_domain = leave_domain_rt, | 1449 | .rq_offline = rq_offline_rt, |
| 1341 | .pre_schedule = pre_schedule_rt, | 1450 | .pre_schedule = pre_schedule_rt, |
| 1342 | .post_schedule = post_schedule_rt, | 1451 | .post_schedule = post_schedule_rt, |
| 1343 | .task_wake_up = task_wake_up_rt, | 1452 | .task_wake_up = task_wake_up_rt, |
| @@ -1350,3 +1459,17 @@ static const struct sched_class rt_sched_class = { | |||
| 1350 | .prio_changed = prio_changed_rt, | 1459 | .prio_changed = prio_changed_rt, |
| 1351 | .switched_to = switched_to_rt, | 1460 | .switched_to = switched_to_rt, |
| 1352 | }; | 1461 | }; |
| 1462 | |||
| 1463 | #ifdef CONFIG_SCHED_DEBUG | ||
| 1464 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | ||
| 1465 | |||
| 1466 | static void print_rt_stats(struct seq_file *m, int cpu) | ||
| 1467 | { | ||
| 1468 | struct rt_rq *rt_rq; | ||
| 1469 | |||
| 1470 | rcu_read_lock(); | ||
| 1471 | for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) | ||
| 1472 | print_rt_rq(m, cpu, rt_rq); | ||
| 1473 | rcu_read_unlock(); | ||
| 1474 | } | ||
| 1475 | #endif /* CONFIG_SCHED_DEBUG */ | ||
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 80179ef7450e..8385d43987e2 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
| @@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
| 118 | if (rq) | 118 | if (rq) |
| 119 | rq->rq_sched_info.cpu_time += delta; | 119 | rq->rq_sched_info.cpu_time += delta; |
| 120 | } | 120 | } |
| 121 | |||
| 122 | static inline void | ||
| 123 | rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | ||
| 124 | { | ||
| 125 | if (rq) | ||
| 126 | rq->rq_sched_info.run_delay += delta; | ||
| 127 | } | ||
| 121 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 128 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) |
| 122 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 129 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) |
| 123 | # define schedstat_set(var, val) do { var = (val); } while (0) | 130 | # define schedstat_set(var, val) do { var = (val); } while (0) |
| @@ -126,6 +133,9 @@ static inline void | |||
| 126 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 133 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) |
| 127 | {} | 134 | {} |
| 128 | static inline void | 135 | static inline void |
| 136 | rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | ||
| 137 | {} | ||
| 138 | static inline void | ||
| 129 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 139 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) |
| 130 | {} | 140 | {} |
| 131 | # define schedstat_inc(rq, field) do { } while (0) | 141 | # define schedstat_inc(rq, field) do { } while (0) |
| @@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
| 134 | #endif | 144 | #endif |
| 135 | 145 | ||
| 136 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 146 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 147 | static inline void sched_info_reset_dequeued(struct task_struct *t) | ||
| 148 | { | ||
| 149 | t->sched_info.last_queued = 0; | ||
| 150 | } | ||
| 151 | |||
| 137 | /* | 152 | /* |
| 138 | * Called when a process is dequeued from the active array and given | 153 | * Called when a process is dequeued from the active array and given |
| 139 | * the cpu. We should note that with the exception of interactive | 154 | * the cpu. We should note that with the exception of interactive |
| @@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
| 143 | * active queue, thus delaying tasks in the expired queue from running; | 158 | * active queue, thus delaying tasks in the expired queue from running; |
| 144 | * see scheduler_tick()). | 159 | * see scheduler_tick()). |
| 145 | * | 160 | * |
| 146 | * This function is only called from sched_info_arrive(), rather than | 161 | * Though we are interested in knowing how long it was from the *first* time a |
| 147 | * dequeue_task(). Even though a task may be queued and dequeued multiple | 162 | * task was queued to the time that it finally hit a cpu, we call this routine |
| 148 | * times as it is shuffled about, we're really interested in knowing how | 163 | * from dequeue_task() to account for possible rq->clock skew across cpus. The |
| 149 | * long it was from the *first* time it was queued to the time that it | 164 | * delta taken on each cpu would annul the skew. |
| 150 | * finally hit a cpu. | ||
| 151 | */ | 165 | */ |
| 152 | static inline void sched_info_dequeued(struct task_struct *t) | 166 | static inline void sched_info_dequeued(struct task_struct *t) |
| 153 | { | 167 | { |
| 154 | t->sched_info.last_queued = 0; | 168 | unsigned long long now = task_rq(t)->clock, delta = 0; |
| 169 | |||
| 170 | if (unlikely(sched_info_on())) | ||
| 171 | if (t->sched_info.last_queued) | ||
| 172 | delta = now - t->sched_info.last_queued; | ||
| 173 | sched_info_reset_dequeued(t); | ||
| 174 | t->sched_info.run_delay += delta; | ||
| 175 | |||
| 176 | rq_sched_info_dequeued(task_rq(t), delta); | ||
| 155 | } | 177 | } |
| 156 | 178 | ||
| 157 | /* | 179 | /* |
| @@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t) | |||
| 165 | 187 | ||
| 166 | if (t->sched_info.last_queued) | 188 | if (t->sched_info.last_queued) |
| 167 | delta = now - t->sched_info.last_queued; | 189 | delta = now - t->sched_info.last_queued; |
| 168 | sched_info_dequeued(t); | 190 | sched_info_reset_dequeued(t); |
| 169 | t->sched_info.run_delay += delta; | 191 | t->sched_info.run_delay += delta; |
| 170 | t->sched_info.last_arrival = now; | 192 | t->sched_info.last_arrival = now; |
| 171 | t->sched_info.pcount++; | 193 | t->sched_info.pcount++; |
| @@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
| 242 | __sched_info_switch(prev, next); | 264 | __sched_info_switch(prev, next); |
| 243 | } | 265 | } |
| 244 | #else | 266 | #else |
| 245 | #define sched_info_queued(t) do { } while (0) | 267 | #define sched_info_queued(t) do { } while (0) |
| 246 | #define sched_info_switch(t, next) do { } while (0) | 268 | #define sched_info_reset_dequeued(t) do { } while (0) |
| 269 | #define sched_info_dequeued(t) do { } while (0) | ||
| 270 | #define sched_info_switch(t, next) do { } while (0) | ||
| 247 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | 271 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
| 248 | 272 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29116652dca8..fe8cdc80ff02 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -266,6 +266,14 @@ static struct ctl_table kern_table[] = { | |||
| 266 | }, | 266 | }, |
| 267 | { | 267 | { |
| 268 | .ctl_name = CTL_UNNUMBERED, | 268 | .ctl_name = CTL_UNNUMBERED, |
| 269 | .procname = "sched_shares_ratelimit", | ||
| 270 | .data = &sysctl_sched_shares_ratelimit, | ||
| 271 | .maxlen = sizeof(unsigned int), | ||
| 272 | .mode = 0644, | ||
| 273 | .proc_handler = &proc_dointvec, | ||
| 274 | }, | ||
| 275 | { | ||
| 276 | .ctl_name = CTL_UNNUMBERED, | ||
| 269 | .procname = "sched_child_runs_first", | 277 | .procname = "sched_child_runs_first", |
| 270 | .data = &sysctl_sched_child_runs_first, | 278 | .data = &sysctl_sched_child_runs_first, |
| 271 | .maxlen = sizeof(unsigned int), | 279 | .maxlen = sizeof(unsigned int), |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a895591e..d63008b09a4c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void) | |||
| 276 | ts->tick_stopped = 1; | 276 | ts->tick_stopped = 1; |
| 277 | ts->idle_jiffies = last_jiffies; | 277 | ts->idle_jiffies = last_jiffies; |
| 278 | rcu_enter_nohz(); | 278 | rcu_enter_nohz(); |
| 279 | sched_clock_tick_stop(cpu); | ||
| 279 | } | 280 | } |
| 280 | 281 | ||
| 281 | /* | 282 | /* |
| @@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void) | |||
| 375 | select_nohz_load_balancer(0); | 376 | select_nohz_load_balancer(0); |
| 376 | now = ktime_get(); | 377 | now = ktime_get(); |
| 377 | tick_do_update_jiffies64(now); | 378 | tick_do_update_jiffies64(now); |
| 379 | sched_clock_tick_start(cpu); | ||
| 378 | cpu_clear(cpu, nohz_cpu_mask); | 380 | cpu_clear(cpu, nohz_cpu_mask); |
| 379 | 381 | ||
| 380 | /* | 382 | /* |
