From 73c101011926c5832e6e141682180c4debe2cf45 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Mar 2011 13:19:51 +0100 Subject: block: initial patch for on-stack per-task plugging This patch adds support for creating a queuing context outside of the queue itself. This enables us to batch up pieces of IO before grabbing the block device queue lock and submitting them to the IO scheduler. The context is created on the stack of the process and assigned in the task structure, so that we can auto-unplug it if we hit a schedule event. The current queue plugging happens implicitly if IO is submitted to an empty device, yet callers have to remember to unplug that IO when they are going to wait for it. This is an ugly API and has caused bugs in the past. Additionally, it requires hacks in the vm (->sync_page() callback) to handle that logic. By switching to an explicit plugging scheme we make the API a lot nicer and can get rid of the ->sync_page() hack in the vm. Signed-off-by: Jens Axboe --- kernel/sched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..ca098bf4cc65 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3978,6 +3978,16 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } + /* + * If we are going to sleep and we have plugged IO queued, make + * sure to submit it to avoid deadlocks. + */ + if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) { + raw_spin_unlock(&rq->lock); + blk_flush_plug(prev); + raw_spin_lock(&rq->lock); + } + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) @@ -5333,6 +5343,7 @@ void __sched io_schedule(void) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; schedule(); current->in_iowait = 0; @@ -5348,6 +5359,7 @@ long __sched io_schedule_timeout(long timeout) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; ret = schedule_timeout(timeout); current->in_iowait = 0; -- cgit v1.2.2 From 16addf954d3954a72fd56abc02ffcba3c18529a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Mar 2011 09:34:53 -0700 Subject: sched: Fix yield_to kernel-doc Add missing function parameters for yield_to(): Warning(kernel/sched.c:5470): No description found for parameter 'p' Warning(kernel/sched.c:5470): No description found for parameter 'preempt' Signed-off-by: Randy Dunlap Cc: Peter Zijlstra LKML-Reference: <20110318093453.8f7489a4.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 58d66ea7d200..052120d67706 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5467,6 +5467,8 @@ EXPORT_SYMBOL(yield); * yield_to - yield the current processor to another thread in * your thread group, or accelerate that thread toward the * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not * * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. -- cgit v1.2.2 From 20dd67407160eac577656cd2f8ee9a1fead960b8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 23 Mar 2011 13:17:23 +0200 Subject: sched: Remove unused 'rq' variable and cpu_rq() call from alloc_fair_sched_group() Signed-off-by: Sergey Senozhatsky Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <20110323111722.GA4244@swordfish.minsk.epam.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 052120d67706..a361e20ec2cd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8443,7 +8443,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; struct sched_entity *se; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8456,8 +8455,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) -- cgit v1.2.2 From b0e77598f87107001a00b8a4ece9c95e4254ccc4 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:24 -0700 Subject: userns: user namespaces: convert several capable() calls CAP_IPC_OWNER and CAP_IPC_LOCK can be checked against current_user_ns(), because the resource comes from current's own ipc namespace. setuid/setgid are to uids in own namespace, so again checks can be against current_user_ns(). Changelog: Jan 11: Use task_ns_capable() in place of sched_capable(). Jan 11: Use nsown_capable() as suggested by Bastian Blank. Jan 11: Clarify (hopefully) some logic in futex and sched.c Feb 15: use ns_capable for ipc, not nsown_capable Feb 23: let copy_ipcs handle setting ipc_ns->user_ns Feb 23: pass ns down rather than taking it from current [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a172494a9a63..480adeb63f8f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4892,8 +4892,11 @@ static bool check_same_owner(struct task_struct *p) rcu_read_lock(); pcred = __task_cred(p); - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); + if (cred->user->user_ns == pcred->user->user_ns) + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + else + match = false; rcu_read_unlock(); return match; } @@ -5221,7 +5224,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); -- cgit v1.2.2 From a51e91981870d013fcfcc08b0117997edbcbc7a7 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 24 Mar 2011 14:00:18 +0100 Subject: sched: Leave sched_setscheduler() earlier if possible, do not disturb SCHED_FIFO tasks sched_setscheduler() (in sched.c) is called in order of changing the scheduling policy and/or the real-time priority of a task. Thus, if we find out that neither of those are actually being modified, it is possible to return earlier and save the overhead of a full deactivate+activate cycle of the task in question. Beside that, if we have more than one SCHED_FIFO task with the same priority on the same rq (which means they share the same priority queue) having one of them changing its position in the priority queue because of a sched_setscheduler (as it happens by means of the deactivate+activate) that does not actually change the priority violates POSIX which states, for SCHED_FIFO: "If a thread whose policy or priority has been modified by pthread_setschedprio() is a running thread or is runnable, the effect on its position in the thread list depends on the direction of the modification, as follows: a. <...> b. If the priority is unchanged, the thread does not change position in the thread list. c. <...>" http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_08.html (ed: And the POSIX specification here does, briefly and somewhat unexpectedly, match what common sense tells us as well. ) Signed-off-by: Dario Faggioli Signed-off-by: Peter Zijlstra LKML-Reference: <1300971618.3960.82.camel@Palantir> Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f592ce6f8616..a8845516ace6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5011,6 +5011,17 @@ recheck: return -EINVAL; } + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* -- cgit v1.2.2 From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 30 Mar 2011 22:57:33 -0300 Subject: Fix common misspellings Fixes generated by 'codespell' and manually reviewed. Signed-off-by: Lucas De Marchi --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f592ce6f8616..865b433fac5b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2309,7 +2309,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * Cause a process which is running on another CPU to enter * kernel-mode, without any delay. (to get signals handled.) * - * NOTE: this function doesnt have to take the runqueue lock, + * NOTE: this function doesn't have to take the runqueue lock, * because all it wants to ensure is that the remote task enters * the kernel. If the IPI races and the task has been migrated * to another CPU then no harm is done and the purpose has been @@ -4997,7 +4997,7 @@ recheck: */ raw_spin_lock_irqsave(&p->pi_lock, flags); /* - * To be able to change p->policy safely, the apropriate + * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ rq = __task_rq_lock(p); @@ -5705,7 +5705,7 @@ void show_state_filter(unsigned long state_filter) do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: + * console might take a lot of time: */ touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) -- cgit v1.2.2 From 49c022e657fbe661460d191fbe776a387132e2b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 10:14:25 +0200 Subject: sched: Clean up rebalance_domains() load-balance interval calculation Instead of the possible multiple-evaluation of num_online_cpus() in rebalance_domains() that Linus reported, avoid it altogether in the normal case since it's implemented with a Hamming weight function over a cpu bitmask which can be darn expensive for those with big iron. This also makes it cleaner, smaller and documents the code. Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra LKML-Reference: <1301991265.2225.12.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a8845516ace6..17b4d226ee0d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6331,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif } + + update_max_interval(); + return NOTIFY_OK; } -- cgit v1.2.2 From 6631e635c65dc33cb798cc2f51d0ddd69ada6319 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 Apr 2011 08:08:20 -0700 Subject: block: don't flush plugged IO on forced preemtion scheduling We really only want to unplug the pending IO when the process actually goes to sleep. So move the test for flushing the plug up to the place where we actually deactivate the task - where we have properly checked for preemption and for the process really sleeping. Acked-by: Jens Axboe Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 48013633d792..a187c3fe027b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4111,20 +4111,20 @@ need_resched: try_to_wake_up_local(to_wakeup); } deactivate_task(rq, prev, DEQUEUE_SLEEP); + + /* + * If we are going to sleep and we have plugged IO queued, make + * sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(prev)) { + raw_spin_unlock(&rq->lock); + blk_flush_plug(prev); + raw_spin_lock(&rq->lock); + } } switch_count = &prev->nvcsw; } - /* - * If we are going to sleep and we have plugged IO queued, make - * sure to submit it to avoid deadlocks. - */ - if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) { - raw_spin_unlock(&rq->lock); - blk_flush_plug(prev); - raw_spin_lock(&rq->lock); - } - pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) -- cgit v1.2.2 From a237c1c5bc5dc5c76a21be922dca4826f3eca8ca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Apr 2011 13:27:55 +0200 Subject: block: let io_schedule() flush the plug inline Linus correctly observes that the most important dispatch cases are now done from kblockd, this isn't ideal for latency reasons. The original reason for switching dispatches out-of-line was to avoid too deep a stack, so by _only_ letting the "accidental" flush directly in schedule() be guarded by offload to kblockd, we should be able to get the best of both worlds. So add a blk_schedule_flush_plug() that offloads to kblockd, and only use that from the schedule() path. Signed-off-by: Jens Axboe --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a187c3fe027b..312f8b95c2d4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4118,7 +4118,7 @@ need_resched: */ if (blk_needs_flush_plug(prev)) { raw_spin_unlock(&rq->lock); - blk_flush_plug(prev); + blk_schedule_flush_plug(prev); raw_spin_lock(&rq->lock); } } -- cgit v1.2.2