Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt      |  15
-rw-r--r--  kernel/exit.c               |   2
-rw-r--r--  kernel/module.c             |  15
-rw-r--r--  kernel/power/Kconfig        |   2
-rw-r--r--  kernel/power/snapshot.c     |  41
-rw-r--r--  kernel/relay.c              |   5
-rw-r--r--  kernel/sched.c              |  58
-rw-r--r--  kernel/sched_debug.c        |   1
-rw-r--r--  kernel/sched_fair.c         | 277
-rw-r--r--  kernel/time/ntp.c           |  23
-rw-r--r--  kernel/time/tick-sched.c    |   2
-rw-r--r--  kernel/time/timekeeping.c   |   6
12 files changed, 297 insertions(+), 150 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0669b70fa6a3..9fdba03dc1fc 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,8 +52,23 @@ config PREEMPT
 
 endchoice
 
+config PREEMPT_RCU
+	bool "Preemptible RCU"
+	depends on PREEMPT
+	default n
+	help
+	  This option reduces the latency of the kernel by making certain
+	  RCU sections preemptible. Normally RCU code is non-preemptible, if
+	  this option is selected then read-only RCU sections become
+	  preemptible. This helps latency, but may expose bugs due to
+	  now-naive assumptions about each RCU read-side critical section
+	  remaining on a given CPU through its execution.
+
+	  Say N if you are unsure.
+
 config RCU_TRACE
 	bool "Enable tracing for RCU - currently stats in debugfs"
+	depends on PREEMPT_RCU
 	select DEBUG_FS
 	default y
 	help
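
Illustration (not part of the patch): the PREEMPT_RCU help text above talks about RCU read-side critical sections. A minimal sketch of such a section using the standard rcu_read_lock()/rcu_read_unlock()/rcu_dereference() API; the pointer and helper names are hypothetical:

	struct foo *p;

	rcu_read_lock();
	p = rcu_dereference(global_foo);	/* global_foo: hypothetical RCU-protected pointer */
	if (p)
		do_something_with(p);		/* hypothetical helper; must not block */
	rcu_read_unlock();

With PREEMPT_RCU=y this region may be preempted, so code inside it can no longer assume it stays on one CPU for its whole execution, which is exactly the class of bug the help text warns about.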
diff --git a/kernel/exit.c b/kernel/exit.c
index cd20bf07e9e3..53872bf993fa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1378,7 +1378,7 @@ unlock_sig:
 	if (!retval && infop)
 		retval = put_user(0, &infop->si_errno);
 	if (!retval && infop)
-		retval = put_user(why, &infop->si_code);
+		retval = put_user((short)why, &infop->si_code);
 	if (!retval && infop)
 		retval = put_user(exit_code, &infop->si_status);
 	if (!retval && infop)
diff --git a/kernel/module.c b/kernel/module.c
index be4807fb90e4..5d437bffd8dc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2178,10 +2178,20 @@ sys_init_module(void __user *umod,
 		wake_up(&module_wq);
 		return ret;
 	}
+	if (ret > 0) {
+		printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
+				    "it should follow 0/-E convention\n"
+		       KERN_WARNING "%s: loading module anyway...\n",
+		       __func__, mod->name, ret,
+		       __func__);
+		dump_stack();
+	}
 
-	/* Now it's a first class citizen! */
-	mutex_lock(&module_mutex);
+	/* Now it's a first class citizen! Wake up anyone waiting for it. */
 	mod->state = MODULE_STATE_LIVE;
+	wake_up(&module_wq);
+
+	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
 	unwind_remove_table(mod->unwind_info, 1);
@@ -2190,7 +2200,6 @@ sys_init_module(void __user *umod,
 	mod->init_size = 0;
 	mod->init_text_size = 0;
 	mutex_unlock(&module_mutex);
-	wake_up(&module_wq);
 
 	return 0;
 }
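
Illustration (not part of the patch): the warning added above fires when a module's init function returns a positive value; the "0/-E convention" means returning 0 on success or a negative errno on failure. A minimal sketch of a conforming init function (hypothetical module, names invented for illustration):

	static int __init example_init(void)
	{
		if (!example_setup_resources())	/* hypothetical helper */
			return -ENOMEM;		/* failure: negative errno */
		return 0;			/* success: exactly zero */
	}
	module_init(example_init);

The hunk also moves wake_up(&module_wq) to immediately after the module is marked MODULE_STATE_LIVE, rather than after the init sections have been freed under module_mutex.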
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 79833170bb9c..6233f3b4ae66 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -190,7 +190,7 @@ config APM_EMULATION
 	  notification of APM "events" (e.g. battery status change).
 
 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/pm.txt> and the
+	  and more information, read <file:Documentation/power/pm.txt> and the
 	  Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 72a020cabb4c..5f91a07c4eac 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
  * of @bm->cur_zone_bm are updated.
  */
 
-static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 				void **addr, unsigned int *bit_nr)
 {
 	struct zone_bitmap *zone_bm;
@@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 		while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
 			zone_bm = zone_bm->next;
 
-			BUG_ON(!zone_bm);
+			if (!zone_bm)
+				return -EFAULT;
 		}
 		bm->cur.zone_bm = zone_bm;
 	}
@@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 	pfn -= bb->start_pfn;
 	*bit_nr = pfn % BM_BITS_PER_CHUNK;
 	*addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+	return 0;
 }
 
 static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	set_bit(bit, addr);
 }
 
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
+{
+	void *addr;
+	unsigned int bit;
+	int error;
+
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	if (!error)
+		set_bit(bit, addr);
+	return error;
+}
+
 static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	clear_bit(bit, addr);
 }
 
501 519
@@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	return test_bit(bit, addr);
 }
 
@@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 				region->end_pfn << PAGE_SHIFT);
 
 		for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
-			if (pfn_valid(pfn))
-				memory_bm_set_bit(bm, pfn);
+			if (pfn_valid(pfn)) {
+				/*
+				 * It is safe to ignore the result of
+				 * mem_bm_set_bit_check() here, since we won't
+				 * touch the PFNs for which the error is
+				 * returned anyway.
+				 */
+				mem_bm_set_bit_check(bm, pfn);
+			}
 	}
 }
 
diff --git a/kernel/relay.c b/kernel/relay.c
index d080b9d161a7..4c035a8a248c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1066,7 +1066,7 @@ static int subbuf_splice_actor(struct file *in,
 			       unsigned int flags,
 			       int *nonpad_ret)
 {
-	unsigned int pidx, poff, total_len, subbuf_pages, ret;
+	unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret;
 	struct rchan_buf *rbuf = in->private_data;
 	unsigned int subbuf_size = rbuf->chan->subbuf_size;
 	uint64_t pos = (uint64_t) *ppos;
@@ -1097,8 +1097,9 @@ static int subbuf_splice_actor(struct file *in,
 	subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
 	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
 	poff = read_start & ~PAGE_MASK;
+	nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
 
-	for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
+	for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
 		unsigned int this_len, this_end, private;
 		unsigned int cur_pos = read_start + total_len;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 52b98675acb2..3f7c5eb254e2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -301,7 +301,7 @@ struct cfs_rq {
 	/* 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr;
+	struct sched_entity *curr, *next;
 
 	unsigned long nr_spread_over;
 
@@ -1084,7 +1084,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	u64 tmp;
 
 	if (unlikely(!lw->inv_weight))
-		lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+		lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
 
 	tmp = (u64)delta_exec * weight;
 	/*
@@ -1108,11 +1108,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
+	lw->inv_weight = 0;
 }
 
 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 {
 	lw->weight -= dec;
+	lw->inv_weight = 0;
 }
 
 /*
@@ -1394,6 +1396,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
 	s64 delta;
 
+	/*
+	 * Buddy candidates are cache hot:
+	 */
+	if (&p->se == cfs_rq_of(&p->se)->next)
+		return 1;
+
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
@@ -1853,10 +1861,11 @@ out_activate:
 		schedstat_inc(p, se.nr_wakeups_remote);
 	update_rq_clock(rq);
 	activate_task(rq, p, 1);
-	check_preempt_curr(rq, p);
 	success = 1;
 
 out_running:
+	check_preempt_curr(rq, p);
+
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -1890,6 +1899,8 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start = 0;
 	p->se.sum_exec_runtime = 0;
 	p->se.prev_sum_exec_runtime = 0;
+	p->se.last_wakeup = 0;
+	p->se.avg_overlap = 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start = 0;
@@ -4268,11 +4279,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		if (running)
-			p->sched_class->put_prev_task(rq, p);
-	}
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
 
 	if (rt_prio(prio))
 		p->sched_class = &rt_sched_class;
@@ -4281,10 +4291,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	p->prio = prio;
 
+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		enqueue_task(rq, p, 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -4581,19 +4590,17 @@ recheck:
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		deactivate_task(rq, p, 0);
-		if (running)
-			p->sched_class->put_prev_task(rq, p);
-	}
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
 
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		activate_task(rq, p, 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -5881,7 +5888,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_unlock_irq(&rq->lock);
 		break;
 
-	case CPU_DOWN_PREPARE:
+	case CPU_DYING:
+	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
@@ -7617,11 +7625,10 @@ void sched_move_task(struct task_struct *tsk)
 	running = task_current(rq, tsk);
 	on_rq = tsk->se.on_rq;
 
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, tsk, 0);
-		if (unlikely(running))
-			tsk->sched_class->put_prev_task(rq, tsk);
-	}
+	if (unlikely(running))
+		tsk->sched_class->put_prev_task(rq, tsk);
 
 	set_task_rq(tsk, task_cpu(tsk));
 
@@ -7630,11 +7637,10 @@ void sched_move_task(struct task_struct *tsk)
 		tsk->sched_class->moved_group(tsk);
 #endif
 
-	if (on_rq) {
-		if (unlikely(running))
-			tsk->sched_class->set_curr_task(rq);
+	if (unlikely(running))
+		tsk->sched_class->set_curr_task(rq);
+	if (on_rq)
 		enqueue_task(rq, tsk, 0);
-	}
 
 	task_rq_unlock(rq, &flags);
 }
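
Illustration (not part of the patch): calc_delta_mine() scales delta_exec by weight/lw->weight using a cached fixed-point reciprocal (lw->inv_weight), which is why update_load_add()/update_load_sub() above now zero inv_weight whenever the weight changes; the reciprocal is recomputed lazily on the next call. Roughly, with WMULT_CONST on the order of 2^32, the arithmetic behaves like the simplified sketch below (names invented; the kernel additionally guards against 64-bit overflow at this point):

	#define WMULT_SHIFT	32

	/* ~2^32 / weight, which the kernel caches in lw->inv_weight */
	static unsigned long inv_weight_of(unsigned long weight)
	{
		return ((1ULL << WMULT_SHIFT) - weight / 2) / (weight + 1);
	}

	/* delta_exec * weight / queue_weight, done as multiply + shift */
	static unsigned long long scale_delta(unsigned long long delta_exec,
					      unsigned long weight,
					      unsigned long queue_weight)
	{
		return (delta_exec * weight * inv_weight_of(queue_weight))
			>> WMULT_SHIFT;
	}

The hunk at old line 1087 changes only how that reciprocal is derived; the multiply-and-shift path itself is unchanged.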
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24cf2f4a..ef358ba07683 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.exec_start);
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
+	PN(se.avg_overlap);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e2a530515619..b85cac4b5e25 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * Maintain a cache of leftmost tree entries (it is frequently
 	 * used):
 	 */
-	if (leftmost)
+	if (leftmost) {
 		cfs_rq->rb_leftmost = &se->run_node;
+		/*
+		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
+		 * value tracking the leftmost vruntime in the tree.
+		 */
+		cfs_rq->min_vruntime =
+			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
+	}
 
 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -184,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (cfs_rq->rb_leftmost == &se->run_node)
-		cfs_rq->rb_leftmost = rb_next(&se->run_node);
+	if (cfs_rq->rb_leftmost == &se->run_node) {
+		struct rb_node *next_node;
+		struct sched_entity *next;
+
+		next_node = rb_next(&se->run_node);
+		cfs_rq->rb_leftmost = next_node;
+
+		if (next_node) {
+			next = rb_entry(next_node,
+					struct sched_entity, run_node);
+			cfs_rq->min_vruntime =
+				max_vruntime(cfs_rq->min_vruntime,
+					     next->vruntime);
+		}
+	}
+
+	if (cfs_rq->next == se)
+		cfs_rq->next = NULL;
 
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
@@ -260,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 slice = __sched_period(cfs_rq->nr_running);
-
-	slice *= se->load.weight;
-	do_div(slice, cfs_rq->load.weight);
-
-	return slice;
+	return calc_delta_mine(__sched_period(cfs_rq->nr_running),
+			       se->load.weight, &cfs_rq->load);
 }
 
 /*
@@ -303,7 +322,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 			  unsigned long delta_exec)
 {
 	unsigned long delta_exec_weighted;
-	u64 vruntime;
 
 	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
 
@@ -315,19 +333,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 							&curr->load);
 	}
 	curr->vruntime += delta_exec_weighted;
-
-	/*
-	 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-	 * value tracking the leftmost vruntime in the tree.
-	 */
-	if (first_fair(cfs_rq)) {
-		vruntime = min_vruntime(curr->vruntime,
-				__pick_next_entity(cfs_rq)->vruntime);
-	} else
-		vruntime = curr->vruntime;
-
-	cfs_rq->min_vruntime =
-		max_vruntime(cfs_rq->min_vruntime, vruntime);
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
333static void update_curr(struct cfs_rq *cfs_rq) 338static void update_curr(struct cfs_rq *cfs_rq)
@@ -493,7 +498,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime;
 
-	vruntime = cfs_rq->min_vruntime;
+	if (first_fair(cfs_rq)) {
+		vruntime = min_vruntime(cfs_rq->min_vruntime,
+				__pick_next_entity(cfs_rq)->vruntime);
+	} else
+		vruntime = cfs_rq->min_vruntime;
 
 	if (sched_feat(TREE_AVG)) {
 		struct sched_entity *last = __pick_last_entity(cfs_rq);
@@ -515,8 +524,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+			vruntime -= calc_delta_fair(sysctl_sched_latency,
+						    &cfs_rq->load);
+		}
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -545,6 +556,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	account_entity_enqueue(cfs_rq, se);
 }
 
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (!se->last_wakeup)
+		return;
+
+	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+	se->last_wakeup = 0;
+}
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -555,6 +581,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
+		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -616,12 +643,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static struct sched_entity *
+pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	s64 diff, gran;
+
+	if (!cfs_rq->next)
+		return se;
+
+	diff = cfs_rq->next->vruntime - se->vruntime;
+	if (diff < 0)
+		return se;
+
+	gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
+	if (diff > gran)
+		return se;
+
+	return cfs_rq->next;
+}
+
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = NULL;
 
 	if (first_fair(cfs_rq)) {
 		se = __pick_next_entity(cfs_rq);
+		se = pick_next(cfs_rq, se);
 		set_next_entity(cfs_rq, se);
 	}
 
@@ -949,96 +996,121 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SMP
-static int select_task_rq_fair(struct task_struct *p, int sync)
+
+static const struct sched_class fair_sched_class;
+
+static int
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+	    int idx, unsigned long load, unsigned long this_load,
+	    unsigned int imbalance)
 {
-	int cpu, this_cpu;
-	struct rq *rq;
-	struct sched_domain *sd, *this_sd = NULL;
-	int new_cpu;
+	struct task_struct *curr = this_rq->curr;
+	unsigned long tl = this_load;
+	unsigned long tl_per_task;
+
+	if (!(this_sd->flags & SD_WAKE_AFFINE))
+		return 0;
+
+	/*
+	 * If the currently running task will sleep within
+	 * a reasonable amount of time then attract this newly
+	 * woken task:
+	 */
+	if (sync && curr->sched_class == &fair_sched_class) {
+		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+		    p->se.avg_overlap < sysctl_sched_migration_cost)
+			return 1;
+	}
+
+	schedstat_inc(p, se.nr_wakeups_affine_attempts);
+	tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current CPU:
+	 */
+	if (sync)
+		tl -= current->se.load.weight;
+
+	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
+			100*(tl + p->se.load.weight) <= imbalance*load) {
+		/*
+		 * This domain has SD_WAKE_AFFINE and
+		 * p is cache cold in this domain, and
+		 * there is no bad imbalance.
+		 */
+		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(p, se.nr_wakeups_affine);
 
-	cpu = task_cpu(p);
-	rq = task_rq(p);
-	this_cpu = smp_processor_id();
-	new_cpu = cpu;
+		return 1;
+	}
+	return 0;
+}
 
-	if (cpu == this_cpu)
-		goto out_set_cpu;
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+	struct sched_domain *sd, *this_sd = NULL;
+	int prev_cpu, this_cpu, new_cpu;
+	unsigned long load, this_load;
+	struct rq *rq, *this_rq;
+	unsigned int imbalance;
+	int idx;
+
+	prev_cpu = task_cpu(p);
+	rq = task_rq(p);
+	this_cpu = smp_processor_id();
+	this_rq = cpu_rq(this_cpu);
+	new_cpu = prev_cpu;
 
+	/*
+	 * 'this_sd' is the first domain that both
+	 * this_cpu and prev_cpu are present in:
+	 */
 	for_each_domain(this_cpu, sd) {
-		if (cpu_isset(cpu, sd->span)) {
+		if (cpu_isset(prev_cpu, sd->span)) {
 			this_sd = sd;
 			break;
 		}
 	}
 
 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-		goto out_set_cpu;
+		goto out;
 
 	/*
 	 * Check for affine wakeup and passive balancing possibilities.
 	 */
-	if (this_sd) {
-		int idx = this_sd->wake_idx;
-		unsigned int imbalance;
-		unsigned long load, this_load;
-
-		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-		load = source_load(cpu, idx);
-		this_load = target_load(this_cpu, idx);
-
-		new_cpu = this_cpu; /* Wake to this CPU if we can */
-
-		if (this_sd->flags & SD_WAKE_AFFINE) {
-			unsigned long tl = this_load;
-			unsigned long tl_per_task;
-
-			/*
-			 * Attract cache-cold tasks on sync wakeups:
-			 */
-			if (sync && !task_hot(p, rq->clock, this_sd))
-				goto out_set_cpu;
-
-			schedstat_inc(p, se.nr_wakeups_affine_attempts);
-			tl_per_task = cpu_avg_load_per_task(this_cpu);
-
-			/*
-			 * If sync wakeup then subtract the (maximum possible)
-			 * effect of the currently running task from the load
-			 * of the current CPU:
-			 */
-			if (sync)
-				tl -= current->se.load.weight;
-
-			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= tl_per_task) ||
-			       100*(tl + p->se.load.weight) <= imbalance*load) {
-				/*
-				 * This domain has SD_WAKE_AFFINE and
-				 * p is cache cold in this domain, and
-				 * there is no bad imbalance.
-				 */
-				schedstat_inc(this_sd, ttwu_move_affine);
-				schedstat_inc(p, se.nr_wakeups_affine);
-				goto out_set_cpu;
-			}
-		}
+	if (!this_sd)
+		goto out;
 
-		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
-		 */
-		if (this_sd->flags & SD_WAKE_BALANCE) {
-			if (imbalance*this_load <= 100*load) {
-				schedstat_inc(this_sd, ttwu_move_balance);
-				schedstat_inc(p, se.nr_wakeups_passive);
-				goto out_set_cpu;
-			}
+	idx = this_sd->wake_idx;
+
+	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+	load = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
+
+	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+			load, this_load, imbalance))
+		return this_cpu;
+
+	if (prev_cpu == this_cpu)
+		goto out;
+
+	/*
+	 * Start passive balancing when half the imbalance_pct
+	 * limit is reached.
+	 */
+	if (this_sd->flags & SD_WAKE_BALANCE) {
+		if (imbalance*this_load <= 100*load) {
+			schedstat_inc(this_sd, ttwu_move_balance);
+			schedstat_inc(p, se.nr_wakeups_passive);
+			return this_cpu;
 		}
 	}
 
-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
+out:
 	return wake_idle(new_cpu, p);
 }
 #endif /* CONFIG_SMP */
@@ -1060,6 +1132,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		resched_task(curr);
 		return;
 	}
+
+	se->last_wakeup = se->sum_exec_runtime;
+	if (unlikely(se == pse))
+		return;
+
+	cfs_rq_of(pse)->next = pse;
+
 	/*
 	 * Batch tasks do not preempt (their preemption is driven by
 	 * the tick):
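
Illustration (not part of the patch): two of the additions above interact. update_avg() maintains se->avg_overlap as an exponential moving average (weight 1/8, the >> 3) of how much runtime a task accumulates between waking another task (last_wakeup is stamped in check_preempt_wakeup()) and going to sleep itself, and wake_affine() then pulls a sync wakeup onto this CPU when both tasks' avg_overlap is below sysctl_sched_migration_cost (500000 ns above), i.e. when waker and wakee barely overlap. A small standalone program showing how quickly that average tracks recent run lengths (same arithmetic as update_avg(); sample values are arbitrary):

	#include <stdio.h>

	/* Same arithmetic as the kernel's update_avg(): avg += (sample - avg)/8,
	 * relying on arithmetic right shift of negative differences, as the
	 * kernel does with s64. */
	static void update_avg(long long *avg, long long sample)
	{
		*avg += (sample - *avg) >> 3;
	}

	int main(void)
	{
		long long avg = 0;
		long long samples[] = { 800000, 800000, 800000, 50000, 50000 };
		int i;

		for (i = 0; i < 5; i++) {
			update_avg(&avg, samples[i]);
			printf("sample %lld ns -> avg_overlap %lld ns\n",
			       samples[i], avg);
		}
		return 0;
	}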
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c88b5910e7ab..5fd9b9469770 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
 long time_freq;		/* frequency offset (scaled ppm)*/
 static long time_reftime;	/* time at last adjustment (s) */
 long time_adjust;
+static long ntp_tick_adj;
 
 static void ntp_update_frequency(void)
 {
 	u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
 				<< TICK_LENGTH_SHIFT;
-	second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+	second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT;
 	second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
 
 	tick_length_base = second_length;
@@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc)
 		freq_adj = shift_right(freq_adj, time_constant * 2 +
 				       (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
 		if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+			u64 utemp64;
 			temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
 			if (time_offset < 0) {
-				temp64 = -temp64;
-				do_div(temp64, mtemp);
-				freq_adj -= temp64;
+				utemp64 = -temp64;
+				do_div(utemp64, mtemp);
+				freq_adj -= utemp64;
 			} else {
-				do_div(temp64, mtemp);
-				freq_adj += temp64;
+				utemp64 = temp64;
+				do_div(utemp64, mtemp);
+				freq_adj += utemp64;
 			}
 		}
 		freq_adj += time_freq;
355 freq_adj += time_freq; 358 freq_adj += time_freq;
@@ -400,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 	notify_cmos_timer();
 	return(result);
 }
+
+static int __init ntp_tick_adj_setup(char *str)
+{
+	ntp_tick_adj = simple_strtol(str, NULL, 0);
+	return 1;
+}
+
+__setup("ntp_tick_adj=", ntp_tick_adj_setup);
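
Usage note (not part of the patch): the new ntp_tick_adj boot parameter replaces the compile-time CLOCK_TICK_ADJUST term in ntp_update_frequency(); it is parsed with simple_strtol() with base 0, so it accepts a signed decimal, octal, or 0x-prefixed value, shifted by TICK_LENGTH_SHIFT just as CLOCK_TICK_ADJUST was. For example (value arbitrary, for illustration only), booting with

	ntp_tick_adj=500

on the kernel command line adds that constant to the computed tick length each time ntp_update_frequency() runs.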
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2968298f8f36..686da821d376 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -640,7 +640,7 @@ void tick_cancel_sched_timer(int cpu)
 
 	if (ts->sched_timer.base)
 		hrtimer_cancel(&ts->sched_timer);
-	ts->tick_stopped = 0;
+
 	ts->nohz_mode = NOHZ_MODE_INACTIVE;
 }
 #endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1af9fb050fe2..671af612b768 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -187,8 +187,7 @@ static void change_clocksource(void)
 
 	clock->error = 0;
 	clock->xtime_nsec = 0;
-	clocksource_calculate_interval(clock,
-		(unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
 
 	tick_clock_notify();
 
@@ -245,8 +244,7 @@ void __init timekeeping_init(void)
 	ntp_clear();
 
 	clock = clocksource_get_next();
-	clocksource_calculate_interval(clock,
-		(unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
 	clock->cycle_last = clocksource_read(clock);
 
 	xtime.tv_sec = sec;