/*
* kernel/sched.c
*
* Kernel scheduler and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
*
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
* 1998-11-19 Implemented schedule_timeout() and related stuff
* by Andrea Arcangeli
* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
* hybrid priority-list and round-robin design with
* an array-switch method of distributing timeslices
* and per-CPU runqueues. Cleanups and useful suggestions
* by Davide Libenzi, preemptible kernel bits by Robert Love.
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
* 2007-04-15 Work begun on replacing all interactivity tuning with a
* fair scheduling design by Con Kolivas.
* 2007-05-05 Load balancing (smp-nice) and other improvements
* by Peter Williams
* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
* Thomas Gleixner, Mike Kravetz
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include "sched_cpupri.h"
#include "workqueue_sched.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT
/*
* These are the 'tuning knobs' of the scheduler:
*
* default timeslice is 100 msecs (used only for SCHED_RR tasks).
* Timeslices get refilled after they expire.
*/
#define DEF_TIMESLICE (100 * HZ / 1000)
/*
* single value that denotes runtime == period, ie unlimited time.
*/
#define RUNTIME_INF ((u64)~0ULL)
static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
return 1;
return 0;
}
static inline int task_has_rt_policy(struct task_struct *p)
{
return rt_policy(p->policy);
}
/*
* This is the priority-queue data structure of the RT scheduling class:
*/
struct rt_prio_array {
DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
struct list_head queue[MAX_RT_PRIO];
};
struct rt_bandwidth {
/* nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
ktime_t rt_period;
u64 rt_runtime;
struct hrtimer rt_period_timer;
};
static struct rt_bandwidth def_rt_bandwidth;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
container_of(timer, struct rt_bandwidth, rt_period_timer);
ktime_t now;
int overrun;
int idle = 0;
for (;;) {
now = hrtimer_cb_get_time(timer);
overrun = hrtimer_forward(timer, now, rt_b->rt_period);
if (!overrun)
break;
idle = do_sched_rt_period_timer(rt_b, overrun);
}
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
static
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
rt_b->rt_period = ns_to_ktime(period);
rt_b->rt_runtime = runtime;
raw_spin_lock_init(&rt_b->rt_runtime_lock);
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
static inline int rt_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
}
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
return;
raw_spin_lock(&rt_b->rt_runtime_lock);
for (;;) {
unsigned long delta;
ktime_t soft, hard;
if (hrtimer_active(&rt_b->rt_period_timer))
break;
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
hard = hrtimer_get_expires(&rt_b->rt_period_timer);
delta = ktime_to_ns(ktime_sub(hard, soft));
__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
HRTIMER_MODE_ABS_PINNED, 0);
}
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif
/*
* sched_domains_mutex serializes calls to arch_init_sched_domains,
* detach_destroy_domains and partition_sched_domains.
*/
static DEFINE_MUTEX(sched_domains_mutex);
#ifdef CONFIG_CGROUP_SCHED
#include <linux/cgroup.h>
struct cfs_rq;
static LIST_HEAD(task_groups);
/* task group related information */
struct task_group {
struct cgroup_subsys_state css;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
struct rt_bandwidth rt_bandwidth;
#endif
struct rcu_head rcu;
struct list_head list;
struct task_group *parent;
struct list_head siblings;
struct list_head children;
};
#define root_task_group init_task_group
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
*/
static DEFINE_SPINLOCK(task_group_lock);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
return list_empty(&root_task_group.children);
}
#endif
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
/*
* A weight of 0 or 1 can cause arithmetics problems.
* A weight of a cfs_rq is the sum of weights of which entities
* are queued on this cfs_rq, so a weight of a entity should not be
* too large, so as the shares value of a task group.
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
#define MIN_SHARES 2
#define MAX_SHARES (1UL << 18)
static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif
/* Default task group.
* Every task in system belong to this group at bootup.
*/
struct task_group init_task_group;
#endif /* CONFIG_CGROUP_SCHED */
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long nr_running;
u64 exec_clock;
u64 min_vruntime;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
struct list_head tasks;
struct list_head *balance_iterator;
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr, *next, *last;
unsigned int nr_spread_over;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_SMP
/*
* the part of load.weight contributed by tasks
*/
unsigned long task_weight;
/*
* h_load = weight * f(tg)
*
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
unsigned long h_load;
/*
* this cpu's part of tg->shares
*/
unsigned long shares;
/*
* load.weight at the time we set shares
*/
unsigned long rq_weight;
#endif
#endif
};
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
int next; /* next highest */
#endif
} highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
struct rq *rq;
struct list_head leaf_rt_rq_list;
struct task_group *tg;
#endif
};
#ifdef CONFIG_SMP
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
* fully partitioning the member cpus from any other cpuset. Whenever a new
* exclusive cpuset is created, we also create and attach a new root-domain
* object.
*
*/
struct root_domain {
atomic_t refcount;
cpumask_var_t span;
cpumask_var_t online;
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
*/
cpumask_var_t rto_mask;
atomic_t rto_count;
struct cpupri cpupri;
};
/*
* By default the system creates a single root-domain with all cpus as
* members (mimicking the global state we have today).
*/
static struct root_domain def_root_domain;
#endif /* CONFIG_SMP */
/*
* This is the main, per-CPU runqueue data structure.
*
* Locking rule: those places that want to lock multiple runqueues
* (such as the load balancing or the thread migration code), lock
* acquire operations must be ordered by ascending &runqueue.
*/
struct rq {
/* runqueue lock: */
raw_spinlock_t lock;
/*
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
unsigned char nohz_balance_kick;
#endif
unsigned int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
struct cfs_rq cfs;
struct rt_rq rt;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
unsigned long nr_uninterruptible;
struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
u64 clock;
u64 clock_task;
atomic_t nr_iowait;
#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain *sd;
unsigned long cpu_power;
unsigned char idle_at_tick;
/* For active balancing */
int post_schedule;
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
/* cpu of this runqueue: */
int cpu;
int online;
unsigned long avg_load_per_task;
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
struct call_single_data hrtick_csd;
#endif
struct hrtimer hrtick_timer;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
/* schedule() stats */
unsigned int sched_switch;
unsigned int sched_count;
unsigned int sched_goidle;
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
/* BKL stats */
unsigned int bkl_count;
#endif
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
return rq->cpu;
#else
return 0;
#endif
}
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
rcu_read_lock_sched_held() || \
lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))
#ifdef CONFIG_CGROUP_SCHED
/*
* Return the group to which this tasks belongs.
*
* We use task_subsys_state_check() and extend the RCU verification
* with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
* holds that lock for each task it moves into the cgroup. Therefore
* by holding that lock, we pin the task to the current cgroup.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
return container_of(css, struct task_group, css);
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
p->se.parent = task_group(p)->se[cpu];
#endif
#ifdef CONFIG_RT_GROUP_SCHED
p->rt.rt_rq = task_group(p)->rt_rq[cpu];
p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}
#else /* CONFIG_CGROUP_SCHED */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
return NULL;
}
#endif /* CONFIG_CGROUP_SCHED */
static void update_rq_clock_task(struct rq *rq, s64 delta);
static void update_rq_clock(struct rq *rq)
{
s64 delta;
if (rq->skip_clock_update)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
rq->clock += delta;
update_rq_clock_task(rq, delta);
}
/*
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif
/**
* runqueue_is_locked
* @cpu: the processor in question.
*
* Returns true if the current cpu runqueue is locked.
* This interface allows printk to be called with the runqueue lock
* held and know whether or not it is OK to wake up the klogd.
*/
int runqueue_is_locked(int cpu)
{
return raw_spin_is_locked(&cpu_rq(cpu)->lock);
}
/*
* Debugging: various feature bits
*/
#define SCHED_FEAT(name, enabled) \
__SCHED_FEAT_##name ,
enum {
#include "sched_features.h"
};
#undef SCHED_FEAT
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
0;
#undef SCHED_FEAT
#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled) \
#name ,
static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
NULL
};
#undef SCHED_FEAT
static int sched_feat_show(struct seq_file *m, void *v)
{
int i;
for (i = 0; sched_feat_names[i]; i++) {
if (!(sysctl_sched_features & (1UL << i)))
seq_puts(m, "NO_");
seq_printf(m, "%s ", sched_feat_names[i]);
}
seq_puts(m, "\n");
return 0;
}
static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[64];
char *cmp;
int neg = 0;
int i;
if (cnt > 63)
cnt = 63;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
cmp = strstrip(buf);
if (strncmp(buf, "NO_", 3) == 0) {
neg = 1;
cmp += 3;
}
for (i = 0; sched_feat_names[i]; i++) {
if (strcmp(cmp, sched_feat_names[i]) == 0) {
if (neg)
sysctl_sched_features &= ~(1UL << i);
else
sysctl_sched_features |= (1UL << i);
break;
}
}
if (!sched_feat_names[i])
return -EINVAL;
*ppos += cnt;
return cnt;
}
static int sched_feat_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_feat_show, NULL);
}
static const struct file_operations sched_feat_fops = {
.open = sched_feat_open,
.write = sched_feat_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static __init int sched_init_debug(void)
{
debugfs_create_file("sched_features", 0644, NULL, NULL,
&sched_feat_fops);
return 0;
}
late_initcall(sched_init_debug);
#endif
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
* ratelimit for updating the group shares.
* default: 0.25ms
*/
unsigned int sysctl_sched_shares_ratelimit = 250000;
unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
/*
* Inject some fuzzyness into changing the per-cpu group shares
* this avoids remote rq-locks at the expense of fairness.
* default: 4
*/
unsigned int sysctl_sched_shares_thresh = 4;
/*
* period over which we average the RT time consumption, measured
* in ms.
*
* default: 1s
*/
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
*/
unsigned int sysctl_sched_rt_period = 1000000;
static __read_mostly int scheduler_running;
/*
* part of the period that we allow rt tasks to run in us.
* default: 0.95s
*/
int sysctl_sched_rt_runtime = 950000;
static inline u64 global_rt_period(void)
{
return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}
static inline u64 global_rt_runtime(void)
{
if (sysctl_sched_rt_runtime < 0)
return RUNTIME_INF;
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif
static inline int task_current(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
}
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
return task_current(rq, p);
}
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
#endif
/*
* If we are tracking spinlock dependencies then we have to
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
raw_spin_unlock_irq(&rq->lock);
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
return p->oncpu;
#else
return task_current(rq, p);
#endif
}
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
/*
* We can optimise this out completely for !SMP, because the
* SMP rebalancing from interrupt is the only thing that cares
* here.
*/
next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
raw_spin_unlock_irq(&rq->lock);
#else
raw_spin_unlock(&rq->lock);
#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
* After ->oncpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
*/
smp_wmb();
prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
* Check whether the task is waking, we use this to synchronize ->cpus_allowed
* against ttwu().
*/
static inline int task_is_waking(struct task_struct *p)
{
return unlikely(p->state == TASK_WAKING);
}
/*
* __task_rq_lock - lock the runqueue a given task resides on.
* Must be called interrupts disabled.
*/
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
struct rq *rq;
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p)))
return rq;
raw_spin_unlock(&rq->lock);
}
}
/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without
* explicitly disabling preemption.
*/
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
__acquires(rq->lock)
{
struct rq *rq;
for (;;) {
local_irq_save(*flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p)))
return rq;
raw_spin_unlock_irqrestore(&rq->lock, *flags);
}
}
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
raw_spin_unlock(&rq->lock);
}
static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
__releases(rq->lock)
{
raw_spin_unlock_irqrestore(&rq->lock, *flags);
}
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
static struct rq *this_rq_lock(void)
__acquires(rq->lock)
{
struct rq *rq;
local_irq_disable();
rq = this_rq();
raw_spin_lock(&rq->lock);
return rq;
}
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
*
* Its all a bit involved since we cannot program an hrt while holding the
* rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
* reschedule event.
*
* When we get rescheduled we reprogram the hrtick_timer outside of the
* rq->lock.
*/
/*
* Use hrtick when:
* - enabled by features
* - hrtimer is actually high res
*/
static inline int hrtick_enabled(struct rq *rq)
{
if (!sched_feat(HRTICK))
return 0;
if (!cpu_active(cpu_of(rq)))
return 0;
return hrtimer_is_hres_active(&rq->hrtick_timer);
}
static void hrtick_clear(struct rq *rq)
{
if (hrtimer_active(&rq->hrtick_timer))
hrtimer_cancel(&rq->hrtick_timer);
}
/*
* High-resolution timer tick.
* Runs from hardirq context with interrupts disabled.
*/
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
struct rq *rq = container_of(timer, struct rq, hrtick_timer);
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
raw_spin_unlock(&rq->lock);
return HRTIMER_NORESTART;
}
#ifdef CONFIG_SMP
/*
* called from hardirq (IPI) context
*/
static void __hrtick_start(void *arg)
{
struct rq *rq = arg;
raw_spin_lock(&rq->lock);
hrtimer_restart(&rq->hrtick_timer);
rq->hrtick_csd_pending = 0;
raw_spin_unlock(&rq->lock);
}
/*
* Called to set the hrtick timer state.
*
* called with rq->lock held and irqs disabled
*/
static void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
hrtimer_restart(timer);
} else if (!rq->hrtick_csd_pending) {
__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
rq->hrtick_csd_pending = 1;
}
}
static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int cpu = (int)(long)hcpu;
switch (action) {
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
hrtick_clear(cpu_rq(cpu));
return NOTIFY_OK;
}
return NOTIFY_DONE;
}
static __init void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
* Called to set the hrtick timer state.
*
* called with rq->lock held and irqs disabled
*/
static void hrtick_start(struct rq *rq, u64 delay)
{
__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
HRTIMER_MODE_REL_PINNED, 0);
}
static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */
static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
rq->hrtick_csd_pending = 0;
rq->hrtick_csd.flags = 0;
rq->hrtick_csd.func = __hrtick_start;
rq->hrtick_csd.info = rq;
#endif
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
static inline void init_rq_hrtick(struct rq *rq)
{
}
static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SCHED_HRTICK */
/*
* resched_task - mark a task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
#ifdef CONFIG_SMP
#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
static void resched_task(struct task_struct *p)
{
int cpu;
assert_raw_spin_locked(&task_rq(p)->lock);
if (test_tsk_need_resched(p))
return;
set_tsk_need_resched(p);
cpu = task_cpu(p);
if (cpu == smp_processor_id())
return;
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
if (!tsk_is_polling(p))
smp_send_reschedule(cpu);
}
static void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
if (!raw_spin_trylock_irqsave(&rq->lock, flags))
return;
resched_task(cpu_curr(cpu));
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
#ifdef CONFIG_NO_HZ
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
* from an idle cpu. This is good for power-savings.
*
* We don't do similar optimization for completely idle system, as
* selecting an idle cpu will add more delays to the timers than intended
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
*/
int get_nohz_timer_target(void)
{
int cpu = smp_processor_id();
int i;
struct sched_domain *sd;
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd))
if (!idle_cpu(i))
return i;
}
return cpu;
}
/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
* which is scheduled to wake up that CPU. In case of a completely
* idle system the next event might even be infinite time into the
* future. wake_up_idle_cpu() ensures that the CPU is woken up and
* leaves the inner idle loop so the newly added timer is taken into
* account when the CPU goes back to idle and evaluates the timer
* wheel for the next timer event.
*/
void wake_up_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (cpu == smp_processor_id())
return;
/*
* This is safe, as this function is called with the timer
* wheel base lock of (cpu) held. When the CPU is on the way
* to idle and has not yet set rq->curr to idle then it will
* be serialized on the timer wheel base lock and take the new
* timer into account automatically.
*/
if (rq->curr != rq->idle)
return;
/*
* We can set TIF_RESCHED on the idle task of the other CPU
* lockless. The worst case is that the other CPU runs the
* idle task through an additional NOOP schedule()
*/
set_tsk_need_resched(rq->idle);
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
if (!tsk_is_polling(rq->idle))
smp_send_reschedule(cpu);
}
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
{
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
static void sched_avg_update(struct rq *rq)
{
s64 period = sched_avg_period();
while ((s64)(rq->clock - rq->age_stamp) > period) {
/*
* Inline assembly required to prevent the compiler
* optimising this loop into a divmod call.
* See __iter_div_u64_rem() for another example of this.
*/
asm("" : "+rm" (rq->age_stamp));
rq->age_stamp += period;
rq->rt_avg /= 2;
}
}
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta;
sched_avg_update(rq);
}
#else /* !CONFIG_SMP */
static void resched_task(struct task_struct *p)
{
assert_raw_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
}
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}
static void sched_avg_update(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
# define WMULT_CONST (1UL << 32)
#endif
#define WMULT_SHIFT 32
/*
* Shift right and round:
*/
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
/*
* delta *= weight / lw
*/
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
{
u64 tmp;
if (!lw->inv_weight) {
if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
lw->inv_weight = 1;
else
lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
/ (lw->weight+1);
}
tmp = (u64)delta_exec * weight;
/*
* Check whether we'd overflow the 64-bit multiplication:
*/
if (unlikely(tmp > WMULT_CONST))
tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
WMULT_SHIFT/2);
else
tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
lw->inv_weight = 0;
}
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
lw->weight -= dec;
lw->inv_weight = 0;
}
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
* each task makes to its run queue's load is weighted according to its
* scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
* scaled version of the new time slice allocation that they receive on time
* slice expiry etc.
*/
#define WEIGHT_IDLEPRIO 3
#define WMULT_IDLEPRIO 1431655765
/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
* nice 1, it will get ~10% less CPU time than another CPU-bound task
* that remained on nice 0.
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
* If a task goes up by ~10% and another task goes down by ~10% then
* the relative distance between them is ~25%.)
*/
static const int prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};
/*
* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
*
* In cases where the weight does not change often, we can use the
* precalculated inverse to speed up arithmetics by turning divisions
* into multiplications:
*/
static const u32 prio_to_wmult[40] = {
/* -20 */ 48388, 59856, 76040, 92818, 118348,
/* -15 */ 147320, 184698, 229616, 287308, 360437,
/* -10 */ 449829, 563644, 704093, 875809, 1099582,
/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
/* Time spent by the tasks of the cpu accounting group executing in ... */
enum cpuacct_stat_index {
CPUACCT_STAT_USER, /* ... user mode */
CPUACCT_STAT_SYSTEM, /* ... kernel mode */
CPUACCT_STAT_NSTATS,
};
#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
static void cpuacct_update_stats(struct task_struct *tsk,
enum cpuacct_stat_index idx, cputime_t val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_update_stats(struct task_struct *tsk,
enum cpuacct_stat_index idx, cputime_t val) {}
#endif
static inline void inc_cpu_load(struct rq *rq, unsigned long load)
{
update_load_add(&rq->load, load);
}
static inline void dec_cpu_load(struct rq *rq, unsigned long load)
{
update_load_sub(&rq->load, load);
}
#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
typedef int (*tg_visitor)(struct task_group *, void *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*/
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
int ret;
rcu_read_lock();
parent = &root_task_group;
down:
ret = (*down)(parent, data);
if (ret)
goto out_unlock;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
ret = (*up)(parent, data);
if (ret)
goto out_unlock;
child = parent;
parent = parent->parent;
if (parent)
goto up;
out_unlock:
rcu_read_unlock();
return ret;
}
static int tg_nop(struct task_group *tg, void *data)
{
return 0;
}
#endif
#ifdef CONFIG_SMP
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
|