Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpuset.c   | 25
-rw-r--r--  kernel/extable.c  |  2
-rw-r--r--  kernel/module.c   | 12
-rw-r--r--  kernel/ptrace.c   | 57
-rw-r--r--  kernel/rcupdate.c | 19
-rw-r--r--  kernel/sched.c    | 62
-rw-r--r--  kernel/timer.c    | 16
7 files changed, 115 insertions, 78 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 72248d1b9e3f..ab81fdd4572b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2231,19 +2231,25 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * So only GFP_KERNEL allocations, if all nodes in the cpuset are
  * short of memory, might require taking the callback_mutex mutex.
  *
- * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
- * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
- * hardwall cpusets - no allocation on a node outside the cpuset is
- * allowed (unless in interrupt, of course).
+ * The first call here from mm/page_alloc:get_page_from_freelist()
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
+ * no allocation on a node outside the cpuset is allowed (unless in
+ * interrupt, of course).
  *
- * The second loop doesn't even call here for GFP_ATOMIC requests
- * (if the __alloc_pages() local variable 'wait' is set). That check
- * and the checks below have the combined affect in the second loop of
- * the __alloc_pages() routine that:
+ * The second pass through get_page_from_freelist() doesn't even call
+ * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
+ * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
+ * in alloc_flags. That logic and the checks below have the combined
+ * affect that:
  *    in_interrupt - any node ok (current task context irrelevant)
  *    GFP_ATOMIC   - any node ok
  *    GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
  *    GFP_USER     - only nodes in current tasks mems allowed ok.
+ *
+ * Rule:
+ *    Don't call cpuset_zone_allowed() if you can't sleep, unless you
+ *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
+ *    the code that might scan up ancestor cpusets and sleep.
  **/
 
 int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
@@ -2255,6 +2261,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
         if (in_interrupt())
                 return 1;
         node = z->zone_pgdat->node_id;
+        might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
         if (node_isset(node, current->mems_allowed))
                 return 1;
         if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
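The rule spelled out in the new comment is the reason for the might_sleep_if() annotation: a caller that cannot sleep has to opt out of the ancestor-cpuset scan explicitly. A minimal caller sketch under that rule, assuming only the cpuset_zone_allowed() wrapper from include/linux/cpuset.h; the helper name below is hypothetical and not part of this patch:

        /* Sketch only: a non-sleeping caller must pass __GFP_HARDWALL. */
        static int zone_usable_nosleep(struct zone *z)
        {
                /*
                 * __GFP_HARDWALL keeps __cpuset_zone_allowed() on its fast
                 * path: no scan up ancestor cpusets, no callback_mutex, and
                 * the new might_sleep_if() check above stays quiet.
                 */
                return cpuset_zone_allowed(z, GFP_ATOMIC | __GFP_HARDWALL);
        }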
diff --git a/kernel/extable.c b/kernel/extable.c
index 7501b531ceed..7fe262855317 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,7 +40,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
         return e;
 }
 
-static int core_kernel_text(unsigned long addr)
+int core_kernel_text(unsigned long addr)
 {
         if (addr >= (unsigned long)_stext &&
             addr <= (unsigned long)_etext)
diff --git a/kernel/module.c b/kernel/module.c
index d24deb0dbbc9..bbe04862e1b0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -705,14 +705,14 @@ EXPORT_SYMBOL(__symbol_put);
 
 void symbol_put_addr(void *addr)
 {
-        unsigned long flags;
+        struct module *modaddr;
 
-        spin_lock_irqsave(&modlist_lock, flags);
-        if (!kernel_text_address((unsigned long)addr))
-                BUG();
+        if (core_kernel_text((unsigned long)addr))
+                return;
 
-        module_put(module_text_address((unsigned long)addr));
-        spin_unlock_irqrestore(&modlist_lock, flags);
+        if (!(modaddr = module_text_address((unsigned long)addr)))
+                BUG();
+        module_put(modaddr);
 }
 EXPORT_SYMBOL_GPL(symbol_put_addr);
 
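For context, symbol_put_addr() is the address-based counterpart of symbol_put(); the rewrite drops the modlist_lock round trip, returns early for core-kernel addresses via the newly exported core_kernel_text(), and hits BUG() only for an address that is in neither. A hedged usage sketch; some_exported_func is a made-up placeholder for whatever exported symbol the caller looks up:

        /* Sketch only: the usual pairing with symbol_get(); the symbol name is a placeholder. */
        static void use_optional_symbol(void)
        {
                int (*fn)(void) = symbol_get(some_exported_func);

                if (!fn)
                        return;                 /* providing module not loaded */
                fn();                           /* safe: symbol_get() pinned its module */
                symbol_put_addr(fn);            /* drop that reference by address */
        }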
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4e0f0ec003f7..921c22ad16e4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -148,12 +148,34 @@ int ptrace_may_attach(struct task_struct *task)
 int ptrace_attach(struct task_struct *task)
 {
         int retval;
-        task_lock(task);
+
         retval = -EPERM;
         if (task->pid <= 1)
-                goto bad;
+                goto out;
         if (task->tgid == current->tgid)
-                goto bad;
+                goto out;
+
+repeat:
+        /*
+         * Nasty, nasty.
+         *
+         * We want to hold both the task-lock and the
+         * tasklist_lock for writing at the same time.
+         * But that's against the rules (tasklist_lock
+         * is taken for reading by interrupts on other
+         * cpu's that may have task_lock).
+         */
+        task_lock(task);
+        local_irq_disable();
+        if (!write_trylock(&tasklist_lock)) {
+                local_irq_enable();
+                task_unlock(task);
+                do {
+                        cpu_relax();
+                } while (!write_can_lock(&tasklist_lock));
+                goto repeat;
+        }
+
         /* the same process cannot be attached many times */
         if (task->ptrace & PT_PTRACED)
                 goto bad;
@@ -166,17 +188,15 @@ int ptrace_attach(struct task_struct *task)
                      ? PT_ATTACHED : 0);
         if (capable(CAP_SYS_PTRACE))
                 task->ptrace |= PT_PTRACE_CAP;
-        task_unlock(task);
 
-        write_lock_irq(&tasklist_lock);
         __ptrace_link(task, current);
-        write_unlock_irq(&tasklist_lock);
 
         force_sig_specific(SIGSTOP, task);
-        return 0;
 
 bad:
+        write_unlock_irq(&tasklist_lock);
         task_unlock(task);
+out:
         return retval;
 }
 
@@ -417,21 +437,22 @@ int ptrace_request(struct task_struct *child, long request,
  */
 int ptrace_traceme(void)
 {
-        int ret;
+        int ret = -EPERM;
 
         /*
          * Are we already being traced?
          */
-        if (current->ptrace & PT_PTRACED)
-                return -EPERM;
-        ret = security_ptrace(current->parent, current);
-        if (ret)
-                return -EPERM;
-        /*
-         * Set the ptrace bit in the process ptrace flags.
-         */
-        current->ptrace |= PT_PTRACED;
-        return 0;
+        task_lock(current);
+        if (!(current->ptrace & PT_PTRACED)) {
+                ret = security_ptrace(current->parent, current);
+                /*
+                 * Set the ptrace bit in the process ptrace flags.
+                 */
+                if (!ret)
+                        current->ptrace |= PT_PTRACED;
+        }
+        task_unlock(current);
+        return ret;
 }
 
 /**
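The trylock-and-retry dance added to ptrace_attach() is a general way to take two locks whose straightforward ordering would deadlock against interrupt-context readers of the second lock. A generic sketch of the same pattern with placeholder lock names (an illustration under those assumptions, not code from this patch):

        /*
         * Lock A is a plain spinlock; lock B is an rwlock that interrupt
         * handlers may take for reading while A is held elsewhere.  So B may
         * only be trylocked once A is held, and on failure everything is
         * dropped and retried so those readers can make progress.
         * On return, both locks are held and local interrupts are disabled.
         */
        static void lock_a_then_b(spinlock_t *a, rwlock_t *b)
        {
        repeat:
                spin_lock(a);
                local_irq_disable();
                if (!write_trylock(b)) {
                        local_irq_enable();
                        spin_unlock(a);
                        do {
                                cpu_relax();    /* spin politely until B looks free */
                        } while (!write_can_lock(b));
                        goto repeat;
                }
        }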
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 6d32ff26f948..2058f88c7bbb 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -479,12 +479,31 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
         return 0;
 }
 
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so. This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
 int rcu_pending(int cpu)
 {
         return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
                 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
 }
 
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so. This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+        struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+        struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
+
+        return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
+}
+
 void rcu_check_callbacks(int cpu, int user)
 {
         if (user ||
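rcu_needs_cpu() answers a different question from rcu_pending(): not "is there work to do right now" but "will this CPU still be needed for RCU at all". A sketch of the kind of caller this is aimed at, assuming a dynticks-style idle path; the helper below is hypothetical and not part of this patch:

        /* Sketch only: keep the periodic tick while RCU still needs this CPU. */
        static int cpu_may_stop_tick(int cpu)
        {
                if (rcu_needs_cpu(cpu))
                        return 0;       /* callbacks queued or work pending: keep ticking */
                return 1;               /* quiet: safe to stop the local tick */
        }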
diff --git a/kernel/sched.c b/kernel/sched.c
index 4c64f85698ae..c13f1bd2df7d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -665,55 +665,13 @@ static int effective_prio(task_t *p)
 }
 
 /*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired, and switch periodically
- * regardless, to ensure that highly interactive tasks do not starve
- * the less fortunate for unreasonably long periods.
- */
-static inline int expired_starving(runqueue_t *rq)
-{
-        int limit;
-
-        /*
-         * Arrays were recently switched, all is well
-         */
-        if (!rq->expired_timestamp)
-                return 0;
-
-        limit = STARVATION_LIMIT * rq->nr_running;
-
-        /*
-         * It's time to switch arrays
-         */
-        if (jiffies - rq->expired_timestamp >= limit)
-                return 1;
-
-        /*
-         * There's a better selection in the expired array
-         */
-        if (rq->curr->static_prio > rq->best_expired_prio)
-                return 1;
-
-        /*
-         * All is well
-         */
-        return 0;
-}
-
-/*
  * __activate_task - move a task to the runqueue.
  */
 static void __activate_task(task_t *p, runqueue_t *rq)
 {
         prio_array_t *target = rq->active;
 
-        if (unlikely(batch_task(p) || (expired_starving(rq) && !rt_task(p))))
+        if (batch_task(p))
                 target = rq->expired;
         enqueue_task(p, target);
         rq->nr_running++;
@@ -2532,6 +2490,22 @@ unsigned long long current_sched_time(const task_t *tsk)
 }
 
 /*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switched decreases with
+ * increasing number of running tasks. We also ignore the interactivity
+ * if a better static_prio task has expired:
+ */
+#define EXPIRED_STARVING(rq) \
+        ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
+                (jiffies - (rq)->expired_timestamp >= \
+                        STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
+                        ((rq)->curr->static_prio > (rq)->best_expired_prio))
+
+/*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -2666,7 +2640,7 @@ void scheduler_tick(void)
 
         if (!rq->expired_timestamp)
                 rq->expired_timestamp = jiffies;
-        if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
+        if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
                 enqueue_task(p, rq->expired);
                 if (p->static_prio < rq->best_expired_prio)
                         rq->best_expired_prio = p->static_prio;
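The restored EXPIRED_STARVING() macro packs two independent conditions into one expression. Purely as a readability aid (my expansion, not code from the patch), it is equivalent to this inline helper:

        static inline int expired_starving_equiv(runqueue_t *rq)
        {
                /* oldest expired task has waited past the load-scaled deadline */
                if (STARVATION_LIMIT && rq->expired_timestamp &&
                    jiffies - rq->expired_timestamp >=
                                STARVATION_LIMIT * rq->nr_running + 1)
                        return 1;
                /* a task with better static_prio is sitting in the expired array */
                if (rq->curr->static_prio > rq->best_expired_prio)
                        return 1;
                return 0;
        }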
diff --git a/kernel/timer.c b/kernel/timer.c
index 67eaf0f54096..9e49deed468c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -541,6 +541,22 @@ found:
         }
         spin_unlock(&base->lock);
 
+        /*
+         * It can happen that other CPUs service timer IRQs and increment
+         * jiffies, but we have not yet got a local timer tick to process
+         * the timer wheels. In that case, the expiry time can be before
+         * jiffies, but since the high-resolution timer here is relative to
+         * jiffies, the default expression when high-resolution timers are
+         * not active,
+         *
+         *   time_before(MAX_JIFFY_OFFSET + jiffies, expires)
+         *
+         * would falsely evaluate to true. If that is the case, just
+         * return jiffies so that we can immediately fire the local timer
+         */
+        if (time_before(expires, jiffies))
+                return jiffies;
+
         if (time_before(hr_expires, expires))
                 return hr_expires;
 
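The guard added above amounts to clamping the computed expiry to the current jiffies. A tiny worked example with made-up numbers, written as a standalone helper rather than the in-place check the patch actually uses:

        static unsigned long clamp_next_expiry(unsigned long expires)
        {
                /*
                 * Say another CPU has already pushed jiffies to 1000 while the
                 * wheel scan on this CPU computed expires == 998.  Then
                 * time_before(998, 1000) is true, so report "now" and fire the
                 * local timer immediately instead of far in the future.
                 */
                if (time_before(expires, jiffies))
                        return jiffies;
                return expires;
        }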