aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c354
1 files changed, 134 insertions, 220 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 34bcc5bc120e..cfa222a91539 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,16 +75,6 @@
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77/* 77/*
78 * Scheduler clock - returns current time in nanosec units.
79 * This is default implementation.
80 * Architectures and sub-architectures can override this.
81 */
82unsigned long long __attribute__((weak)) sched_clock(void)
83{
84 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
85}
86
87/*
88 * Convert user-nice values [ -20 ... 0 ... 19 ] 78 * Convert user-nice values [ -20 ... 0 ... 19 ]
89 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
90 * and back. 80 * and back.
@@ -242,6 +232,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
242} 232}
243#endif 233#endif
244 234
235/*
236 * sched_domains_mutex serializes calls to arch_init_sched_domains,
237 * detach_destroy_domains and partition_sched_domains.
238 */
239static DEFINE_MUTEX(sched_domains_mutex);
240
245#ifdef CONFIG_GROUP_SCHED 241#ifdef CONFIG_GROUP_SCHED
246 242
247#include <linux/cgroup.h> 243#include <linux/cgroup.h>
@@ -308,9 +304,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
308 */ 304 */
309static DEFINE_SPINLOCK(task_group_lock); 305static DEFINE_SPINLOCK(task_group_lock);
310 306
311/* doms_cur_mutex serializes access to doms_cur[] array */
312static DEFINE_MUTEX(doms_cur_mutex);
313
314#ifdef CONFIG_FAIR_GROUP_SCHED 307#ifdef CONFIG_FAIR_GROUP_SCHED
315#ifdef CONFIG_USER_SCHED 308#ifdef CONFIG_USER_SCHED
316# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -318,7 +311,13 @@ static DEFINE_MUTEX(doms_cur_mutex);
318# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
319#endif 312#endif
320 313
314/*
315 * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
316 * (The default weight is 1024 - so there's no practical
317 * limitation from this.)
318 */
321#define MIN_SHARES 2 319#define MIN_SHARES 2
320#define MAX_SHARES (ULONG_MAX - 1)
322 321
323static int init_task_group_load = INIT_TASK_GROUP_LOAD; 322static int init_task_group_load = INIT_TASK_GROUP_LOAD;
324#endif 323#endif
@@ -358,21 +357,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
358#endif 357#endif
359} 358}
360 359
361static inline void lock_doms_cur(void)
362{
363 mutex_lock(&doms_cur_mutex);
364}
365
366static inline void unlock_doms_cur(void)
367{
368 mutex_unlock(&doms_cur_mutex);
369}
370
371#else 360#else
372 361
373static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 362static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
374static inline void lock_doms_cur(void) { }
375static inline void unlock_doms_cur(void) { }
376 363
377#endif /* CONFIG_GROUP_SCHED */ 364#endif /* CONFIG_GROUP_SCHED */
378 365
@@ -560,13 +547,7 @@ struct rq {
560 unsigned long next_balance; 547 unsigned long next_balance;
561 struct mm_struct *prev_mm; 548 struct mm_struct *prev_mm;
562 549
563 u64 clock, prev_clock_raw; 550 u64 clock;
564 s64 clock_max_delta;
565
566 unsigned int clock_warps, clock_overflows, clock_underflows;
567 u64 idle_clock;
568 unsigned int clock_deep_idle_events;
569 u64 tick_timestamp;
570 551
571 atomic_t nr_iowait; 552 atomic_t nr_iowait;
572 553
@@ -631,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
631#endif 612#endif
632} 613}
633 614
634#ifdef CONFIG_NO_HZ
635static inline bool nohz_on(int cpu)
636{
637 return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
638}
639
640static inline u64 max_skipped_ticks(struct rq *rq)
641{
642 return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
643}
644
645static inline void update_last_tick_seen(struct rq *rq)
646{
647 rq->last_tick_seen = jiffies;
648}
649#else
650static inline u64 max_skipped_ticks(struct rq *rq)
651{
652 return 1;
653}
654
655static inline void update_last_tick_seen(struct rq *rq)
656{
657}
658#endif
659
660/*
661 * Update the per-runqueue clock, as finegrained as the platform can give
662 * us, but without assuming monotonicity, etc.:
663 */
664static void __update_rq_clock(struct rq *rq)
665{
666 u64 prev_raw = rq->prev_clock_raw;
667 u64 now = sched_clock();
668 s64 delta = now - prev_raw;
669 u64 clock = rq->clock;
670
671#ifdef CONFIG_SCHED_DEBUG
672 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
673#endif
674 /*
675 * Protect against sched_clock() occasionally going backwards:
676 */
677 if (unlikely(delta < 0)) {
678 clock++;
679 rq->clock_warps++;
680 } else {
681 /*
682 * Catch too large forward jumps too:
683 */
684 u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
685 u64 max_time = rq->tick_timestamp + max_jump;
686
687 if (unlikely(clock + delta > max_time)) {
688 if (clock < max_time)
689 clock = max_time;
690 else
691 clock++;
692 rq->clock_overflows++;
693 } else {
694 if (unlikely(delta > rq->clock_max_delta))
695 rq->clock_max_delta = delta;
696 clock += delta;
697 }
698 }
699
700 rq->prev_clock_raw = now;
701 rq->clock = clock;
702}
703
704static void update_rq_clock(struct rq *rq)
705{
706 if (likely(smp_processor_id() == cpu_of(rq)))
707 __update_rq_clock(rq);
708}
709
710/* 615/*
711 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 616 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
712 * See detach_destroy_domains: synchronize_sched for details. 617 * See detach_destroy_domains: synchronize_sched for details.
@@ -722,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
722#define task_rq(p) cpu_rq(task_cpu(p)) 627#define task_rq(p) cpu_rq(task_cpu(p))
723#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 628#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
724 629
630static inline void update_rq_clock(struct rq *rq)
631{
632 rq->clock = sched_clock_cpu(cpu_of(rq));
633}
634
725/* 635/*
726 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 636 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
727 */ 637 */
@@ -757,14 +667,14 @@ const_debug unsigned int sysctl_sched_features =
757#define SCHED_FEAT(name, enabled) \ 667#define SCHED_FEAT(name, enabled) \
758 #name , 668 #name ,
759 669
760__read_mostly char *sched_feat_names[] = { 670static __read_mostly char *sched_feat_names[] = {
761#include "sched_features.h" 671#include "sched_features.h"
762 NULL 672 NULL
763}; 673};
764 674
765#undef SCHED_FEAT 675#undef SCHED_FEAT
766 676
767int sched_feat_open(struct inode *inode, struct file *filp) 677static int sched_feat_open(struct inode *inode, struct file *filp)
768{ 678{
769 filp->private_data = inode->i_private; 679 filp->private_data = inode->i_private;
770 return 0; 680 return 0;
@@ -899,7 +809,7 @@ static inline u64 global_rt_runtime(void)
899 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 809 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
900} 810}
901 811
902static const unsigned long long time_sync_thresh = 100000; 812unsigned long long time_sync_thresh = 100000;
903 813
904static DEFINE_PER_CPU(unsigned long long, time_offset); 814static DEFINE_PER_CPU(unsigned long long, time_offset);
905static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); 815static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
@@ -913,11 +823,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
913static DEFINE_SPINLOCK(time_sync_lock); 823static DEFINE_SPINLOCK(time_sync_lock);
914static unsigned long long prev_global_time; 824static unsigned long long prev_global_time;
915 825
916static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) 826static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
917{ 827{
918 unsigned long flags; 828 /*
919 829 * We want this inlined, to not get tracer function calls
920 spin_lock_irqsave(&time_sync_lock, flags); 830 * in this critical section:
831 */
832 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
833 __raw_spin_lock(&time_sync_lock.raw_lock);
921 834
922 if (time < prev_global_time) { 835 if (time < prev_global_time) {
923 per_cpu(time_offset, cpu) += prev_global_time - time; 836 per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -926,7 +839,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
926 prev_global_time = time; 839 prev_global_time = time;
927 } 840 }
928 841
929 spin_unlock_irqrestore(&time_sync_lock, flags); 842 __raw_spin_unlock(&time_sync_lock.raw_lock);
843 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
930 844
931 return time; 845 return time;
932} 846}
@@ -934,8 +848,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
934static unsigned long long __cpu_clock(int cpu) 848static unsigned long long __cpu_clock(int cpu)
935{ 849{
936 unsigned long long now; 850 unsigned long long now;
937 unsigned long flags;
938 struct rq *rq;
939 851
940 /* 852 /*
941 * Only call sched_clock() if the scheduler has already been 853 * Only call sched_clock() if the scheduler has already been
@@ -944,11 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
944 if (unlikely(!scheduler_running)) 856 if (unlikely(!scheduler_running))
945 return 0; 857 return 0;
946 858
947 local_irq_save(flags); 859 now = sched_clock_cpu(cpu);
948 rq = cpu_rq(cpu);
949 update_rq_clock(rq);
950 now = rq->clock;
951 local_irq_restore(flags);
952 860
953 return now; 861 return now;
954} 862}
@@ -960,13 +868,18 @@ static unsigned long long __cpu_clock(int cpu)
960unsigned long long cpu_clock(int cpu) 868unsigned long long cpu_clock(int cpu)
961{ 869{
962 unsigned long long prev_cpu_time, time, delta_time; 870 unsigned long long prev_cpu_time, time, delta_time;
871 unsigned long flags;
963 872
873 local_irq_save(flags);
964 prev_cpu_time = per_cpu(prev_cpu_time, cpu); 874 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
965 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); 875 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
966 delta_time = time-prev_cpu_time; 876 delta_time = time-prev_cpu_time;
967 877
968 if (unlikely(delta_time > time_sync_thresh)) 878 if (unlikely(delta_time > time_sync_thresh)) {
969 time = __sync_cpu_clock(time, cpu); 879 time = __sync_cpu_clock(time, cpu);
880 per_cpu(prev_cpu_time, cpu) = time;
881 }
882 local_irq_restore(flags);
970 883
971 return time; 884 return time;
972} 885}
@@ -1117,43 +1030,6 @@ static struct rq *this_rq_lock(void)
1117 return rq; 1030 return rq;
1118} 1031}
1119 1032
1120/*
1121 * We are going deep-idle (irqs are disabled):
1122 */
1123void sched_clock_idle_sleep_event(void)
1124{
1125 struct rq *rq = cpu_rq(smp_processor_id());
1126
1127 spin_lock(&rq->lock);
1128 __update_rq_clock(rq);
1129 spin_unlock(&rq->lock);
1130 rq->clock_deep_idle_events++;
1131}
1132EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
1133
1134/*
1135 * We just idled delta nanoseconds (called with irqs disabled):
1136 */
1137void sched_clock_idle_wakeup_event(u64 delta_ns)
1138{
1139 struct rq *rq = cpu_rq(smp_processor_id());
1140 u64 now = sched_clock();
1141
1142 rq->idle_clock += delta_ns;
1143 /*
1144 * Override the previous timestamp and ignore all
1145 * sched_clock() deltas that occured while we idled,
1146 * and use the PM-provided delta_ns to advance the
1147 * rq clock:
1148 */
1149 spin_lock(&rq->lock);
1150 rq->prev_clock_raw = now;
1151 rq->clock += delta_ns;
1152 spin_unlock(&rq->lock);
1153 touch_softlockup_watchdog();
1154}
1155EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
1156
1157static void __resched_task(struct task_struct *p, int tif_bit); 1033static void __resched_task(struct task_struct *p, int tif_bit);
1158 1034
1159static inline void resched_task(struct task_struct *p) 1035static inline void resched_task(struct task_struct *p)
@@ -1189,6 +1065,7 @@ static inline void resched_rq(struct rq *rq)
1189enum { 1065enum {
1190 HRTICK_SET, /* re-programm hrtick_timer */ 1066 HRTICK_SET, /* re-programm hrtick_timer */
1191 HRTICK_RESET, /* not a new slice */ 1067 HRTICK_RESET, /* not a new slice */
1068 HRTICK_BLOCK, /* stop hrtick operations */
1192}; 1069};
1193 1070
1194/* 1071/*
@@ -1200,6 +1077,8 @@ static inline int hrtick_enabled(struct rq *rq)
1200{ 1077{
1201 if (!sched_feat(HRTICK)) 1078 if (!sched_feat(HRTICK))
1202 return 0; 1079 return 0;
1080 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
1081 return 0;
1203 return hrtimer_is_hres_active(&rq->hrtick_timer); 1082 return hrtimer_is_hres_active(&rq->hrtick_timer);
1204} 1083}
1205 1084
@@ -1275,14 +1154,70 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1275 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1154 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1276 1155
1277 spin_lock(&rq->lock); 1156 spin_lock(&rq->lock);
1278 __update_rq_clock(rq); 1157 update_rq_clock(rq);
1279 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1158 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1280 spin_unlock(&rq->lock); 1159 spin_unlock(&rq->lock);
1281 1160
1282 return HRTIMER_NORESTART; 1161 return HRTIMER_NORESTART;
1283} 1162}
1284 1163
1285static inline void init_rq_hrtick(struct rq *rq) 1164static void hotplug_hrtick_disable(int cpu)
1165{
1166 struct rq *rq = cpu_rq(cpu);
1167 unsigned long flags;
1168
1169 spin_lock_irqsave(&rq->lock, flags);
1170 rq->hrtick_flags = 0;
1171 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1172 spin_unlock_irqrestore(&rq->lock, flags);
1173
1174 hrtick_clear(rq);
1175}
1176
1177static void hotplug_hrtick_enable(int cpu)
1178{
1179 struct rq *rq = cpu_rq(cpu);
1180 unsigned long flags;
1181
1182 spin_lock_irqsave(&rq->lock, flags);
1183 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1184 spin_unlock_irqrestore(&rq->lock, flags);
1185}
1186
1187static int
1188hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1189{
1190 int cpu = (int)(long)hcpu;
1191
1192 switch (action) {
1193 case CPU_UP_CANCELED:
1194 case CPU_UP_CANCELED_FROZEN:
1195 case CPU_DOWN_PREPARE:
1196 case CPU_DOWN_PREPARE_FROZEN:
1197 case CPU_DEAD:
1198 case CPU_DEAD_FROZEN:
1199 hotplug_hrtick_disable(cpu);
1200 return NOTIFY_OK;
1201
1202 case CPU_UP_PREPARE:
1203 case CPU_UP_PREPARE_FROZEN:
1204 case CPU_DOWN_FAILED:
1205 case CPU_DOWN_FAILED_FROZEN:
1206 case CPU_ONLINE:
1207 case CPU_ONLINE_FROZEN:
1208 hotplug_hrtick_enable(cpu);
1209 return NOTIFY_OK;
1210 }
1211
1212 return NOTIFY_DONE;
1213}
1214
1215static void init_hrtick(void)
1216{
1217 hotcpu_notifier(hotplug_hrtick, 0);
1218}
1219
1220static void init_rq_hrtick(struct rq *rq)
1286{ 1221{
1287 rq->hrtick_flags = 0; 1222 rq->hrtick_flags = 0;
1288 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1223 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -1319,6 +1254,10 @@ static inline void init_rq_hrtick(struct rq *rq)
1319void hrtick_resched(void) 1254void hrtick_resched(void)
1320{ 1255{
1321} 1256}
1257
1258static inline void init_hrtick(void)
1259{
1260}
1322#endif 1261#endif
1323 1262
1324/* 1263/*
@@ -1438,8 +1377,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1438{ 1377{
1439 u64 tmp; 1378 u64 tmp;
1440 1379
1441 if (unlikely(!lw->inv_weight)) 1380 if (!lw->inv_weight)
1442 lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); 1381 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
1443 1382
1444 tmp = (u64)delta_exec * weight; 1383 tmp = (u64)delta_exec * weight;
1445 /* 1384 /*
@@ -1748,6 +1687,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1748 1687
1749 if (shares < MIN_SHARES) 1688 if (shares < MIN_SHARES)
1750 shares = MIN_SHARES; 1689 shares = MIN_SHARES;
1690 else if (shares > MAX_SHARES)
1691 shares = MAX_SHARES;
1751 1692
1752 __set_se_shares(tg->se[tcpu], shares); 1693 __set_se_shares(tg->se[tcpu], shares);
1753} 1694}
@@ -4339,8 +4280,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4339 struct rq *rq = this_rq(); 4280 struct rq *rq = this_rq();
4340 cputime64_t tmp; 4281 cputime64_t tmp;
4341 4282
4342 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) 4283 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4343 return account_guest_time(p, cputime); 4284 account_guest_time(p, cputime);
4285 return;
4286 }
4344 4287
4345 p->stime = cputime_add(p->stime, cputime); 4288 p->stime = cputime_add(p->stime, cputime);
4346 4289
@@ -4404,19 +4347,11 @@ void scheduler_tick(void)
4404 int cpu = smp_processor_id(); 4347 int cpu = smp_processor_id();
4405 struct rq *rq = cpu_rq(cpu); 4348 struct rq *rq = cpu_rq(cpu);
4406 struct task_struct *curr = rq->curr; 4349 struct task_struct *curr = rq->curr;
4407 u64 next_tick = rq->tick_timestamp + TICK_NSEC; 4350
4351 sched_clock_tick();
4408 4352
4409 spin_lock(&rq->lock); 4353 spin_lock(&rq->lock);
4410 __update_rq_clock(rq); 4354 update_rq_clock(rq);
4411 /*
4412 * Let rq->clock advance by at least TICK_NSEC:
4413 */
4414 if (unlikely(rq->clock < next_tick)) {
4415 rq->clock = next_tick;
4416 rq->clock_underflows++;
4417 }
4418 rq->tick_timestamp = rq->clock;
4419 update_last_tick_seen(rq);
4420 update_cpu_load(rq); 4355 update_cpu_load(rq);
4421 curr->sched_class->task_tick(rq, curr, 0); 4356 curr->sched_class->task_tick(rq, curr, 0);
4422 spin_unlock(&rq->lock); 4357 spin_unlock(&rq->lock);
@@ -4570,7 +4505,7 @@ need_resched_nonpreemptible:
4570 * Do the rq-clock update outside the rq lock: 4505 * Do the rq-clock update outside the rq lock:
4571 */ 4506 */
4572 local_irq_disable(); 4507 local_irq_disable();
4573 __update_rq_clock(rq); 4508 update_rq_clock(rq);
4574 spin_lock(&rq->lock); 4509 spin_lock(&rq->lock);
4575 clear_tsk_need_resched(prev); 4510 clear_tsk_need_resched(prev);
4576 4511
@@ -4595,9 +4530,9 @@ need_resched_nonpreemptible:
4595 prev->sched_class->put_prev_task(rq, prev); 4530 prev->sched_class->put_prev_task(rq, prev);
4596 next = pick_next_task(rq, prev); 4531 next = pick_next_task(rq, prev);
4597 4532
4598 sched_info_switch(prev, next);
4599
4600 if (likely(prev != next)) { 4533 if (likely(prev != next)) {
4534 sched_info_switch(prev, next);
4535
4601 rq->nr_switches++; 4536 rq->nr_switches++;
4602 rq->curr = next; 4537 rq->curr = next;
4603 ++*switch_count; 4538 ++*switch_count;
@@ -4632,8 +4567,6 @@ EXPORT_SYMBOL(schedule);
4632asmlinkage void __sched preempt_schedule(void) 4567asmlinkage void __sched preempt_schedule(void)
4633{ 4568{
4634 struct thread_info *ti = current_thread_info(); 4569 struct thread_info *ti = current_thread_info();
4635 struct task_struct *task = current;
4636 int saved_lock_depth;
4637 4570
4638 /* 4571 /*
4639 * If there is a non-zero preempt_count or interrupts are disabled, 4572 * If there is a non-zero preempt_count or interrupts are disabled,
@@ -4644,16 +4577,7 @@ asmlinkage void __sched preempt_schedule(void)
4644 4577
4645 do { 4578 do {
4646 add_preempt_count(PREEMPT_ACTIVE); 4579 add_preempt_count(PREEMPT_ACTIVE);
4647
4648 /*
4649 * We keep the big kernel semaphore locked, but we
4650 * clear ->lock_depth so that schedule() doesnt
4651 * auto-release the semaphore:
4652 */
4653 saved_lock_depth = task->lock_depth;
4654 task->lock_depth = -1;
4655 schedule(); 4580 schedule();
4656 task->lock_depth = saved_lock_depth;
4657 sub_preempt_count(PREEMPT_ACTIVE); 4581 sub_preempt_count(PREEMPT_ACTIVE);
4658 4582
4659 /* 4583 /*
@@ -4674,26 +4598,15 @@ EXPORT_SYMBOL(preempt_schedule);
4674asmlinkage void __sched preempt_schedule_irq(void) 4598asmlinkage void __sched preempt_schedule_irq(void)
4675{ 4599{
4676 struct thread_info *ti = current_thread_info(); 4600 struct thread_info *ti = current_thread_info();
4677 struct task_struct *task = current;
4678 int saved_lock_depth;
4679 4601
4680 /* Catch callers which need to be fixed */ 4602 /* Catch callers which need to be fixed */
4681 BUG_ON(ti->preempt_count || !irqs_disabled()); 4603 BUG_ON(ti->preempt_count || !irqs_disabled());
4682 4604
4683 do { 4605 do {
4684 add_preempt_count(PREEMPT_ACTIVE); 4606 add_preempt_count(PREEMPT_ACTIVE);
4685
4686 /*
4687 * We keep the big kernel semaphore locked, but we
4688 * clear ->lock_depth so that schedule() doesnt
4689 * auto-release the semaphore:
4690 */
4691 saved_lock_depth = task->lock_depth;
4692 task->lock_depth = -1;
4693 local_irq_enable(); 4607 local_irq_enable();
4694 schedule(); 4608 schedule();
4695 local_irq_disable(); 4609 local_irq_disable();
4696 task->lock_depth = saved_lock_depth;
4697 sub_preempt_count(PREEMPT_ACTIVE); 4610 sub_preempt_count(PREEMPT_ACTIVE);
4698 4611
4699 /* 4612 /*
@@ -5612,7 +5525,6 @@ static void __cond_resched(void)
5612 } while (need_resched()); 5525 } while (need_resched());
5613} 5526}
5614 5527
5615#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
5616int __sched _cond_resched(void) 5528int __sched _cond_resched(void)
5617{ 5529{
5618 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 5530 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
@@ -5623,7 +5535,6 @@ int __sched _cond_resched(void)
5623 return 0; 5535 return 0;
5624} 5536}
5625EXPORT_SYMBOL(_cond_resched); 5537EXPORT_SYMBOL(_cond_resched);
5626#endif
5627 5538
5628/* 5539/*
5629 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 5540 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -5918,8 +5829,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5918 spin_unlock_irqrestore(&rq->lock, flags); 5829 spin_unlock_irqrestore(&rq->lock, flags);
5919 5830
5920 /* Set the preempt count _outside_ the spinlocks! */ 5831 /* Set the preempt count _outside_ the spinlocks! */
5832#if defined(CONFIG_PREEMPT)
5833 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5834#else
5921 task_thread_info(idle)->preempt_count = 0; 5835 task_thread_info(idle)->preempt_count = 0;
5922 5836#endif
5923 /* 5837 /*
5924 * The idle tasks have their own, simple scheduling class: 5838 * The idle tasks have their own, simple scheduling class:
5925 */ 5839 */
@@ -7755,7 +7669,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7755{ 7669{
7756 int i, j; 7670 int i, j;
7757 7671
7758 lock_doms_cur(); 7672 mutex_lock(&sched_domains_mutex);
7759 7673
7760 /* always unregister in case we don't destroy any domains */ 7674 /* always unregister in case we don't destroy any domains */
7761 unregister_sched_domain_sysctl(); 7675 unregister_sched_domain_sysctl();
@@ -7804,7 +7718,7 @@ match2:
7804 7718
7805 register_sched_domain_sysctl(); 7719 register_sched_domain_sysctl();
7806 7720
7807 unlock_doms_cur(); 7721 mutex_unlock(&sched_domains_mutex);
7808} 7722}
7809 7723
7810#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7724#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -7813,8 +7727,10 @@ int arch_reinit_sched_domains(void)
7813 int err; 7727 int err;
7814 7728
7815 get_online_cpus(); 7729 get_online_cpus();
7730 mutex_lock(&sched_domains_mutex);
7816 detach_destroy_domains(&cpu_online_map); 7731 detach_destroy_domains(&cpu_online_map);
7817 err = arch_init_sched_domains(&cpu_online_map); 7732 err = arch_init_sched_domains(&cpu_online_map);
7733 mutex_unlock(&sched_domains_mutex);
7818 put_online_cpus(); 7734 put_online_cpus();
7819 7735
7820 return err; 7736 return err;
@@ -7932,13 +7848,16 @@ void __init sched_init_smp(void)
7932 BUG_ON(sched_group_nodes_bycpu == NULL); 7848 BUG_ON(sched_group_nodes_bycpu == NULL);
7933#endif 7849#endif
7934 get_online_cpus(); 7850 get_online_cpus();
7851 mutex_lock(&sched_domains_mutex);
7935 arch_init_sched_domains(&cpu_online_map); 7852 arch_init_sched_domains(&cpu_online_map);
7936 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7853 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7937 if (cpus_empty(non_isolated_cpus)) 7854 if (cpus_empty(non_isolated_cpus))
7938 cpu_set(smp_processor_id(), non_isolated_cpus); 7855 cpu_set(smp_processor_id(), non_isolated_cpus);
7856 mutex_unlock(&sched_domains_mutex);
7939 put_online_cpus(); 7857 put_online_cpus();
7940 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7858 /* XXX: Theoretical race here - CPU may be hotplugged now */
7941 hotcpu_notifier(update_sched_domains, 0); 7859 hotcpu_notifier(update_sched_domains, 0);
7860 init_hrtick();
7942 7861
7943 /* Move init over to a non-isolated CPU */ 7862 /* Move init over to a non-isolated CPU */
7944 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 7863 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
@@ -8025,7 +7944,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8025 7944
8026 se->my_q = cfs_rq; 7945 se->my_q = cfs_rq;
8027 se->load.weight = tg->shares; 7946 se->load.weight = tg->shares;
8028 se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight); 7947 se->load.inv_weight = 0;
8029 se->parent = parent; 7948 se->parent = parent;
8030} 7949}
8031#endif 7950#endif
@@ -8149,8 +8068,6 @@ void __init sched_init(void)
8149 spin_lock_init(&rq->lock); 8068 spin_lock_init(&rq->lock);
8150 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 8069 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8151 rq->nr_running = 0; 8070 rq->nr_running = 0;
8152 rq->clock = 1;
8153 update_last_tick_seen(rq);
8154 init_cfs_rq(&rq->cfs, rq); 8071 init_cfs_rq(&rq->cfs, rq);
8155 init_rt_rq(&rq->rt, rq); 8072 init_rt_rq(&rq->rt, rq);
8156#ifdef CONFIG_FAIR_GROUP_SCHED 8073#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8294,6 +8211,7 @@ EXPORT_SYMBOL(__might_sleep);
8294static void normalize_task(struct rq *rq, struct task_struct *p) 8211static void normalize_task(struct rq *rq, struct task_struct *p)
8295{ 8212{
8296 int on_rq; 8213 int on_rq;
8214
8297 update_rq_clock(rq); 8215 update_rq_clock(rq);
8298 on_rq = p->se.on_rq; 8216 on_rq = p->se.on_rq;
8299 if (on_rq) 8217 if (on_rq)
@@ -8325,7 +8243,6 @@ void normalize_rt_tasks(void)
8325 p->se.sleep_start = 0; 8243 p->se.sleep_start = 0;
8326 p->se.block_start = 0; 8244 p->se.block_start = 0;
8327#endif 8245#endif
8328 task_rq(p)->clock = 0;
8329 8246
8330 if (!rt_task(p)) { 8247 if (!rt_task(p)) {
8331 /* 8248 /*
@@ -8692,7 +8609,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8692 dequeue_entity(cfs_rq, se, 0); 8609 dequeue_entity(cfs_rq, se, 0);
8693 8610
8694 se->load.weight = shares; 8611 se->load.weight = shares;
8695 se->load.inv_weight = div64_u64((1ULL<<32), shares); 8612 se->load.inv_weight = 0;
8696 8613
8697 if (on_rq) 8614 if (on_rq)
8698 enqueue_entity(cfs_rq, se, 0); 8615 enqueue_entity(cfs_rq, se, 0);
@@ -8722,13 +8639,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8722 if (!tg->se[0]) 8639 if (!tg->se[0])
8723 return -EINVAL; 8640 return -EINVAL;
8724 8641
8725 /*
8726 * A weight of 0 or 1 can cause arithmetics problems.
8727 * (The default weight is 1024 - so there's no practical
8728 * limitation from this.)
8729 */
8730 if (shares < MIN_SHARES) 8642 if (shares < MIN_SHARES)
8731 shares = MIN_SHARES; 8643 shares = MIN_SHARES;
8644 else if (shares > MAX_SHARES)
8645 shares = MAX_SHARES;
8732 8646
8733 mutex_lock(&shares_mutex); 8647 mutex_lock(&shares_mutex);
8734 if (tg->shares == shares) 8648 if (tg->shares == shares)
@@ -8753,7 +8667,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8753 * force a rebalance 8667 * force a rebalance
8754 */ 8668 */
8755 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8669 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8756 set_se_shares(tg->se[i], shares/nr_cpu_ids); 8670 set_se_shares(tg->se[i], shares);
8757 } 8671 }
8758 8672
8759 /* 8673 /*
@@ -9072,7 +8986,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9072#endif 8986#endif
9073 8987
9074#ifdef CONFIG_RT_GROUP_SCHED 8988#ifdef CONFIG_RT_GROUP_SCHED
9075static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8989static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9076 s64 val) 8990 s64 val)
9077{ 8991{
9078 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 8992 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);