author     Venkatesh Pallipadi <venki@google.com>   2011-04-14 13:30:53 -0400
committer  Ingo Molnar <mingo@elte.hu>              2011-04-19 04:08:38 -0400
commit     2f36825b176f67e5c5228aa33d828bc39718811f (patch)
tree       065130f8eb74a419d8ddf1856ccc64ee7b195843
parent     69c80f3e9d3c569f8a3cee94ba1a324b5a7fa6b9 (diff)
sched: Next buddy hint on sleep and preempt path
When a task in a taskgroup sleeps, pick_next_task starts all the way
back at the root and picks the task/taskgroup with the min vruntime
across all runnable tasks. But when there are many frequently sleeping
tasks across different taskgroups, it makes better sense to stay with
the same taskgroup for its slice period (or until all tasks in the
taskgroup sleep) instead of switching to another taskgroup on each
sleep after a short runtime. This helps specifically where each
taskgroup corresponds to a process with multiple threads. The change
reduces the number of CR3 switches in this case.

Example:

Two taskgroups with 2 threads each, every thread running for 2ms and
sleeping for 1ms. Looking at sched:sched_switch shows:

BEFORE: taskgroup_1 threads [5004, 5005], taskgroup_2 threads [5016, 5017]
      cpu-soaker-5004  [003]  3683.391089
      cpu-soaker-5016  [003]  3683.393106
      cpu-soaker-5005  [003]  3683.395119
      cpu-soaker-5017  [003]  3683.397130
      cpu-soaker-5004  [003]  3683.399143
      cpu-soaker-5016  [003]  3683.401155
      cpu-soaker-5005  [003]  3683.403168
      cpu-soaker-5017  [003]  3683.405170

AFTER: taskgroup_1 threads [21890, 21891], taskgroup_2 threads [21934, 21935]
      cpu-soaker-21890 [003]   865.895494
      cpu-soaker-21935 [003]   865.897506
      cpu-soaker-21934 [003]   865.899520
      cpu-soaker-21935 [003]   865.901532
      cpu-soaker-21934 [003]   865.903543
      cpu-soaker-21935 [003]   865.905546
      cpu-soaker-21891 [003]   865.907548
      cpu-soaker-21890 [003]   865.909560
      cpu-soaker-21891 [003]   865.911571
      cpu-soaker-21890 [003]   865.913582
      cpu-soaker-21891 [003]   865.915594
      cpu-soaker-21934 [003]   865.917606

A similar problem exists when there are multiple taskgroups and, say, a
task A preempts the currently running task B of taskgroup_1. On
schedule, pick_next_task can pick an unrelated task from taskgroup_2.
Here it would be better to give some preference to task B in
pick_next_task.

A simple (maybe extreme) benchmark I tried was tbench with 2 tbench
client processes with 2 threads each, running on a single CPU. Average
throughput across 5 runs of 50 seconds each was:

BEFORE: 105.84 MB/sec
AFTER:  112.42 MB/sec

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1302802253-25760-1-git-send-email-venki@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
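The cpu-soaker tool used in the trace above is not part of this commit.
As a rough, self-contained stand-in, assuming it simply burns the CPU
for about 2 ms and then sleeps for 1 ms per iteration (the thread count
and timings come from the description above, everything else is an
assumption), it could look like this:

/*
 * Hypothetical stand-in for the "cpu-soaker" workload: each thread
 * busy-loops for ~2 ms, then sleeps for 1 ms, forever.
 * Build with: gcc -O2 -o cpu-soaker cpu-soaker.c -pthread
 */
#include <pthread.h>
#include <time.h>

#define NR_THREADS	2
#define RUN_NS		2000000LL	/* ~2 ms of busy work */
#define SLEEP_NS	1000000L	/* 1 ms sleep */

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void *soak(void *arg)
{
	struct timespec nap = { 0, SLEEP_NS };

	(void)arg;
	for (;;) {
		long long start = now_ns();

		while (now_ns() - start < RUN_NS)
			;		/* burn CPU */
		nanosleep(&nap, NULL);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[NR_THREADS];
	int i;

	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&tid[i], NULL, soak, NULL);
	for (i = 0; i < NR_THREADS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

Running two instances of such a program, each in its own cpu cgroup and
both pinned to the same CPU, approximates the traced scenario.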
-rw-r--r--  kernel/sched_fair.c | 26
1 file changed, 23 insertions(+), 3 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 501ab637b94b..5280272cce3e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1344,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	hrtick_update(rq);
 }
 
+static void set_next_buddy(struct sched_entity *se);
+
 /*
  * The dequeue_task method is called before nr_running is
  * decreased. We remove the task from the rbtree and
@@ -1353,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int task_sleep = flags & DEQUEUE_SLEEP;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 
 		/* Don't dequeue parent if it has other entities besides us */
-		if (cfs_rq->load.weight)
+		if (cfs_rq->load.weight) {
+			/*
+			 * Bias pick_next to pick a task from this cfs_rq, as
+			 * p is sleeping when it is within its sched_slice.
+			 */
+			if (task_sleep && parent_entity(se))
+				set_next_buddy(parent_entity(se));
 			break;
+		}
 		flags |= DEQUEUE_SLEEP;
 	}
 
@@ -1877,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
+	int next_buddy_marked = 0;
 
 	if (unlikely(se == pse))
 		return;
 
-	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
+	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
 		set_next_buddy(pse);
+		next_buddy_marked = 1;
+	}
 
 	/*
 	 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1910,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	update_curr(cfs_rq);
 	find_matching_se(&se, &pse);
 	BUG_ON(!pse);
-	if (wakeup_preempt_entity(se, pse) == 1)
+	if (wakeup_preempt_entity(se, pse) == 1) {
+		/*
+		 * Bias pick_next to pick the sched entity that is
+		 * triggering this preemption.
+		 */
+		if (!next_buddy_marked)
+			set_next_buddy(pse);
 		goto preempt;
+	}
 
 	return;
 
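For context on how the hint set above takes effect: in this kernel,
pick_next_entity() prefers cfs_rq->next over the leftmost (min-vruntime)
entity as long as that does not introduce unfairness beyond the wakeup
granularity (checked via wakeup_preempt_entity()). The standalone toy
model below is not kernel code; the entity names, vruntime values and
the 1 ms granularity are illustrative only, but it shows the selection
bias the buddy hint creates:

/*
 * Toy model of the "next buddy" bias (not kernel code).  Without a
 * hint, the entity with the smallest vruntime wins; with a hint, the
 * hinted entity is preferred as long as its vruntime is not too far
 * beyond the leftmost one (a stand-in for the wakeup granularity
 * check done by wakeup_preempt_entity() in the real scheduler).
 */
#include <stdio.h>

struct toy_entity {
	const char *name;
	unsigned long long vruntime;	/* ns, like se->vruntime */
};

static struct toy_entity *next_buddy;	/* set on sleep/preempt, as in the patch */
static const unsigned long long wakeup_gran_ns = 1000000;	/* 1 ms, illustrative */

static struct toy_entity *pick_entity(struct toy_entity *rq, int nr)
{
	struct toy_entity *left = &rq[0];
	int i;

	/* leftmost entity = minimum vruntime */
	for (i = 1; i < nr; i++)
		if (rq[i].vruntime < left->vruntime)
			left = &rq[i];

	/* prefer the buddy unless picking it would be too unfair */
	if (next_buddy && next_buddy->vruntime <= left->vruntime + wakeup_gran_ns)
		return next_buddy;

	return left;
}

int main(void)
{
	struct toy_entity rq[] = {
		{ "group_A/thread0", 1000000 },
		{ "group_A/thread1", 1400000 },
		{ "group_B/thread0",  900000 },		/* leftmost */
	};

	printf("no hint   -> %s\n", pick_entity(rq, 3)->name);

	next_buddy = &rq[1];	/* e.g. a sibling of a task that just slept */
	printf("with hint -> %s\n", pick_entity(rq, 3)->name);
	return 0;
}

With the hint set, group_A/thread1 is chosen even though group_B/thread0
has the smaller vruntime, which is the "stay within the same taskgroup"
effect described in the changelog.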