/*
 * kernel/sched_pfair.c
 *
 * Implementation of the PD^2 pfair scheduling algorithm. This
 * implementation realizes "early releasing," i.e., it is work-conserving.
 *
 */

#include <asm/div64.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/slab.h>

#include <litmus/litmus.h>
#include <litmus/jobs.h>
#include <litmus/preempt.h>
#include <litmus/rt_domain.h>
#include <litmus/sched_plugin.h>
#include <litmus/sched_trace.h>
#include <litmus/trace.h>

#include <litmus/bheap.h>

/* to configure the cluster size */
#include <litmus/clustered.h>
#include <litmus/clock.h>

static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;

struct subtask {
	/* measured in quanta relative to job release */
	quanta_t release;
	quanta_t deadline;
	quanta_t overlap; /* called "b bit" by PD^2 */
	quanta_t group_deadline;
};

struct pfair_param {
	quanta_t	quanta;       /* number of subtasks */
	quanta_t	cur;          /* index of current subtask */

	quanta_t	release;      /* in quanta */
	quanta_t	period;       /* in quanta */

	quanta_t	last_quantum; /* when scheduled last */
	int		last_cpu;     /* where scheduled last */

	unsigned int	needs_requeue:1;

	struct pfair_cluster* cluster; /* where this task is scheduled */

	struct subtask subtasks[0];   /* allocate together with pfair_param */
};

#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)

struct pfair_state {
	struct cluster_cpu topology;

	struct hrtimer quantum_timer;

	volatile quanta_t cur_tick;    /* updated by the CPU that is advancing
					* the time */
	volatile quanta_t local_tick;  /* What tick is the local CPU currently
					* executing? Updated only by the local
					* CPU. In QEMU, this may lag behind the
					* current tick. In a real system, with
					* proper timers and aligned quanta,
					* that should only be the case for a
					* very short time after the time
					* advanced. With staggered quanta, it
					* will lag for the duration of the
					* offset.
					*/

	struct task_struct* linked;    /* the task that should be executing */
	struct task_struct* local;     /* the local copy of linked          */
	struct task_struct* scheduled; /* what is actually scheduled        */

	struct list_head    out_of_budget; /* list of tasks that exhausted
					    * their allocation */

	lt_t offset;			/* stagger offset */
	unsigned int missed_updates;
	unsigned int missed_quanta;
};

struct pfair_cluster {
	struct scheduling_cluster topology;

	/* The "global" time in this cluster. */
	quanta_t pfair_time; /* the "official" PFAIR clock */

	/* The ready queue for this cluster. */
	rt_domain_t pfair;

	/* The set of jobs that should have their release enacted at the next
	 * quantum boundary.
	 */
	struct bheap release_queue;
	raw_spinlock_t release_lock;
};

static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
{
	return container_of(state->topology.cluster, struct pfair_cluster,
			    topology);
}

static inline int cpu_id(struct pfair_state* state)
{
	return state->topology.id;
}

static inline struct pfair_state* from_cluster_list(struct list_head* pos)
{
	return list_entry(pos, struct pfair_state, topology.cluster_list);
}

static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
{
	return container_of(rt, struct pfair_cluster, pfair);
}

static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
{
	/* The ready_lock is used to serialize all scheduling events. */
	return &cluster->pfair.ready_lock;
}

static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
{
	return cluster_lock(cpu_cluster(state));
}

DEFINE_PER_CPU(struct pfair_state, pfair_state);
struct pfair_state **pstate; /* short cut */

static struct pfair_cluster* pfair_clusters;
static int num_pfair_clusters;

/* Enable for lots of trace info.
 * #define PFAIR_DEBUG
 */

#ifdef PFAIR_DEBUG
#define PTRACE_TASK(t, f, args...)  TRACE_TASK(t, f, ## args)
#define PTRACE(f, args...) TRACE(f, ## args)
#else
#define PTRACE_TASK(t, f, args...)
#define PTRACE(f, args...)
#endif
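/*
 * PD^2 priority order, as implemented by pfair_higher_prio() below:
 * (1) the earlier subtask deadline wins, (2) on a deadline tie, the larger
 * b-bit ("overlap") wins, (3) on a b-bit tie, the later group deadline wins,
 * and (4) any remaining tie is broken in favor of the lower PID.
 * The cur_*() accessors translate the per-subtask parameters (stored
 * relative to the job release) into absolute quanta by adding the job's
 * release quantum.
 */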
/* gcc will inline all of these accessor functions... */
static struct subtask* cur_subtask(struct task_struct* t)
{
	return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
}

static quanta_t cur_deadline(struct task_struct* t)
{
	return cur_subtask(t)->deadline + tsk_pfair(t)->release;
}

static quanta_t cur_release(struct task_struct* t)
{
	/* This is early releasing: only the release of the first subtask
	 * counts. */
	return tsk_pfair(t)->release;
}

static quanta_t cur_overlap(struct task_struct* t)
{
	return cur_subtask(t)->overlap;
}

static quanta_t cur_group_deadline(struct task_struct* t)
{
	quanta_t gdl = cur_subtask(t)->group_deadline;
	if (gdl)
		return gdl + tsk_pfair(t)->release;
	else
		return gdl;
}

static int pfair_higher_prio(struct task_struct* first,
			     struct task_struct* second)
{
	return	/* first task must exist */
		first && (
		/* Does the second task exist and is it a real-time task? If
		 * not, the first task (which is a RT task) has higher
		 * priority.
		 */
		!second || !is_realtime(second) ||

		/* Is the (subtask) deadline of the first task earlier?
		 * Then it has higher priority.
		 */
		time_before(cur_deadline(first), cur_deadline(second)) ||

		/* Do we have a deadline tie?
		 * Then break by B-bit.
		 */
		(cur_deadline(first) == cur_deadline(second) &&
		 (cur_overlap(first) > cur_overlap(second) ||

		/* Do we have a B-bit tie?
		 * Then break by group deadline.
		 */
		(cur_overlap(first) == cur_overlap(second) &&
		 (time_after(cur_group_deadline(first),
			     cur_group_deadline(second)) ||

		/* Do we have a group deadline tie?
		 * Then break by PIDs, which are unique.
		 */
		(cur_group_deadline(first) == cur_group_deadline(second) &&
		 first->pid < second->pid))))));
}

int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
{
	return pfair_higher_prio(bheap2task(a), bheap2task(b));
}

static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
{
	struct pfair_cluster* cluster = from_domain(rt);
	unsigned long flags;

	raw_spin_lock_irqsave(&cluster->release_lock, flags);

	bheap_union(pfair_ready_order, &cluster->release_queue, tasks);

	raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
}

static void prepare_release(struct task_struct* t, quanta_t at)
{
	tsk_pfair(t)->release = at;
	tsk_pfair(t)->cur     = 0;
}

/* pull released tasks from the release queue */
static void poll_releases(struct pfair_cluster* cluster)
{
	raw_spin_lock(&cluster->release_lock);
	__merge_ready(&cluster->pfair, &cluster->release_queue);
	raw_spin_unlock(&cluster->release_lock);
}

static void check_preempt(struct task_struct* t)
{
	int cpu = NO_CPU;
	if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
	    is_present(t)) {
		/* the task can be scheduled and
		 * is not scheduled where it ought to be scheduled
		 */
		cpu = tsk_rt(t)->linked_on != NO_CPU ?
			tsk_rt(t)->linked_on         :
			tsk_rt(t)->scheduled_on;
		PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
			    tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
		/* preempt */
		litmus_reschedule(cpu);
	}
}
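/*
 * drop_all_references() scrubs every reference the plugin may still hold to
 * a task: its ready-queue heap node, the per-CPU linked/local/scheduled
 * pointers, the linked_on field, and the out_of_budget list. It is used
 * whenever a task must be removed from the scheduler's data structures
 * (task exit, early completion, or a task found absent at a period
 * boundary), so that no CPU can dereference stale state afterwards.
 */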
/* caller must hold pfair.ready_lock */
static void drop_all_references(struct task_struct *t)
{
	int cpu;
	struct pfair_state* s;
	struct pfair_cluster* cluster;
	if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
		/* It must be in the ready queue; drop references isn't called
		 * when the job is in a release queue. */
		cluster = tsk_pfair(t)->cluster;
		bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
			     tsk_rt(t)->heap_node);
	}
	for (cpu = 0; cpu < num_online_cpus(); cpu++) {
		s = &per_cpu(pfair_state, cpu);
		if (s->linked == t)
			s->linked = NULL;
		if (s->local == t)
			s->local = NULL;
		if (s->scheduled == t)
			s->scheduled = NULL;
	}
	/* make sure we don't have a stale linked_on field */
	tsk_rt(t)->linked_on = NO_CPU;

	/* make sure we're not queued for re-releasing */
	if (in_list(&tsk_rt(t)->list)) {
		TRACE_TASK(t, "removing from out_of_budget queue\n");
		list_del(&tsk_rt(t)->list);
	}
}

static void pfair_prepare_next_period(struct task_struct* t)
{
	struct pfair_param* p = tsk_pfair(t);

	prepare_for_next_period(t);
	tsk_rt(t)->completed = 0;
	p->release = time2quanta(get_release(t), CEIL);
}

/* returns 1 if the task needs to go to the release queue */
static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
{
	struct pfair_param* p = tsk_pfair(t);
	int to_relq;
	p->cur = (p->cur + 1) % p->quanta;
	if (!p->cur) {
		if (is_present(t)) {
			/* The job overran; we start a new budget allocation. */
			TRACE_TASK(t, "overran budget, preparing next period\n");
			sched_trace_task_completion(t, 1);
			pfair_prepare_next_period(t);
		} else {
			/* remove task from system until it wakes */
			drop_all_references(t);
			p->needs_requeue = 1;
			TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
				   cpu, p->cur);
			return 0;
		}
	}
	to_relq = time_after(cur_release(t), time);
	TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d "
		   "(cur_release:%lu time:%lu present:%d on_cpu=%d)\n",
		   cpu, p->cur, to_relq, cur_release(t), time,
		   tsk_rt(t)->present, tsk_rt(t)->scheduled_on);
	return to_relq;
}

static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
{
	struct task_struct* l;
	struct pfair_param* p;
	struct list_head* pos;
	struct pfair_state* cpu;

	list_for_each(pos, &cluster->topology.cpus) {
		cpu = from_cluster_list(pos);
		l = cpu->linked;
		cpu->missed_updates += cpu->linked != cpu->local;
		if (l) {
			p = tsk_pfair(l);
			p->last_quantum = time;
			p->last_cpu     = cpu_id(cpu);
			if (advance_subtask(time, l, cpu_id(cpu))) {
				cpu->linked = NULL;
				tsk_rt(l)->linked_on = NO_CPU;
				PTRACE_TASK(l, "should go to release queue. "
					    "scheduled_on=%d present=%d\n",
					    tsk_rt(l)->scheduled_on,
					    tsk_rt(l)->present);
				list_add(&tsk_rt(l)->list, &cpu->out_of_budget);
			}
		}
	}
}
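/*
 * Linking is the cluster-wide assignment of tasks to CPUs computed at each
 * quantum boundary; scheduling is what each CPU actually runs once it
 * observes the new quantum. target_cpu() and pfair_link() below try to keep
 * a task on the CPU where it is still scheduled (scheduled_on) or where it
 * ran in the immediately preceding quantum (last_cpu), redirecting the link
 * when possible so that back-to-back quanta do not force a migration.
 */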
static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
{
	int cpu;
	if (tsk_rt(t)->scheduled_on != NO_CPU) {
		/* always observe scheduled_on linkage */
		default_cpu = tsk_rt(t)->scheduled_on;
	} else if (tsk_pfair(t)->last_quantum == time - 1) {
		/* back2back quanta */
		/* Only observe last_quantum if no scheduled_on is in the way.
		 * This should only kick in if a CPU missed quanta, and that
		 * *should* only happen in QEMU.
		 */
		cpu = tsk_pfair(t)->last_cpu;
		if (!pstate[cpu]->linked ||
		    tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
			default_cpu = cpu;
		}
	}

	return default_cpu;
}

/* returns one if linking was redirected */
static int pfair_link(quanta_t time, int cpu,
		      struct task_struct* t)
{
	int target = target_cpu(time, t, cpu);
	struct task_struct* prev  = pstate[cpu]->linked;
	struct task_struct* other;
	struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);

	if (target != cpu) {
		BUG_ON(pstate[target]->topology.cluster !=
		       pstate[cpu]->topology.cluster);
		other = pstate[target]->linked;
		pstate[target]->linked = t;
		tsk_rt(t)->linked_on   = target;
		if (!other)
			/* linked ok, but reschedule this CPU */
			return 1;
		if (target < cpu) {
			/* link other to cpu instead */
			tsk_rt(other)->linked_on = cpu;
			pstate[cpu]->linked      = other;
			if (prev) {
				/* prev got pushed back into the ready queue */
				tsk_rt(prev)->linked_on = NO_CPU;
				__add_ready(&cluster->pfair, prev);
			}
			/* we are done with this cpu */
			return 0;
		} else {
			/* re-add other, its original CPU was not considered yet */
			tsk_rt(other)->linked_on = NO_CPU;
			__add_ready(&cluster->pfair, other);
			/* reschedule this CPU */
			return 1;
		}
	} else {
		pstate[cpu]->linked  = t;
		tsk_rt(t)->linked_on = cpu;
		if (prev) {
			/* prev got pushed back into the ready queue */
			tsk_rt(prev)->linked_on = NO_CPU;
			__add_ready(&cluster->pfair, prev);
		}
		/* we are done with this CPU */
		return 0;
	}
}

static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
{
	int retry;
	struct list_head *pos;
	struct pfair_state *cpu_state;

	list_for_each(pos, &cluster->topology.cpus) {
		cpu_state = from_cluster_list(pos);
		retry = 1;
#ifdef CONFIG_RELEASE_MASTER
		/* skip release master */
		if (cluster->pfair.release_master == cpu_id(cpu_state))
			continue;
#endif
		while (retry) {
			if (pfair_higher_prio(__peek_ready(&cluster->pfair),
					      cpu_state->linked))
				retry = pfair_link(time, cpu_id(cpu_state),
						   __take_ready(&cluster->pfair));
			else
				retry = 0;
		}
	}
}

static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
{
	struct pfair_state *cpu;
	struct list_head* pos;

	/* called with interrupts disabled */
	PTRACE("--- Q %lu at %llu PRE-SPIN\n",
	       time, litmus_clock());
	raw_spin_lock(cluster_lock(cluster));
	PTRACE("<<< Q %lu at %llu\n",
	       time, litmus_clock());

	sched_trace_quantum_boundary();

	advance_subtasks(cluster, time);
	poll_releases(cluster);
	schedule_subtasks(cluster, time);

	list_for_each(pos, &cluster->topology.cpus) {
		cpu = from_cluster_list(pos);
		if (cpu->linked)
			PTRACE_TASK(cpu->linked,
				    " linked on %d.\n", cpu_id(cpu));
		else
			PTRACE("(null) linked on %d.\n", cpu_id(cpu));
	}
	/* We are done. Advance time. */
	mb();
	list_for_each(pos, &cluster->topology.cpus) {
		cpu = from_cluster_list(pos);
		if (cpu->local_tick != cpu->cur_tick) {
			TRACE("BAD Quantum not acked on %d "
			      "(l:%lu c:%lu p:%lu)\n",
			      cpu_id(cpu),
			      cpu->local_tick,
			      cpu->cur_tick,
			      cluster->pfair_time);
			cpu->missed_quanta++;
		}
		cpu->cur_tick = time;
	}
	PTRACE(">>> Q %lu at %llu\n",
	       time, litmus_clock());
	raw_spin_unlock(cluster_lock(cluster));
}
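/*
 * Quantum-boundary protocol: every CPU evaluates pfair_tick() from its
 * quantum timer. The first CPU to win the cmpxchg() on the cluster's
 * pfair_time computes the next cluster-wide assignment via
 * schedule_next_quantum(); all other CPUs spin in wait_for_quantum() until
 * cur_tick has been advanced for them and then pick up their copy of the
 * new linkage.
 */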
static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
{
	quanta_t loc;

	goto first; /* skip mb() on first iteration */
	do {
		cpu_relax();
		mb();
	first:	loc = state->cur_tick;
		/* FIXME: what if loc > cur? */
	} while (time_before(loc, q));
	PTRACE("observed cur_tick:%lu >= q:%lu\n", loc, q);
}

static quanta_t current_quantum(struct pfair_state* state)
{
	lt_t t = litmus_clock() - state->offset;
	return time2quanta(t, FLOOR);
}

static void catchup_quanta(quanta_t from, quanta_t target,
			   struct pfair_state* state)
{
	quanta_t cur = from, time;
	TRACE("+++< BAD catching up quanta from %lu to %lu\n",
	      from, target);
	while (time_before(cur, target)) {
		wait_for_quantum(cur, state);
		cur++;
		time = cmpxchg(&cpu_cluster(state)->pfair_time,
			       cur - 1, /* expected */
			       cur      /* next     */
			);
		if (time == cur - 1)
			schedule_next_quantum(cpu_cluster(state), cur);
	}
	TRACE("+++> catching up done\n");
}

/* pfair_tick - this function is called by the per-CPU quantum timer
 * (see on_quantum_boundary()) at each quantum boundary.
 */
static void pfair_tick(struct task_struct* t)
{
	struct pfair_state* state = this_cpu_ptr(&pfair_state);
	quanta_t time, cur;
	int retry = 10;

	do {
		cur = current_quantum(state);
		PTRACE("q %lu at %llu\n", cur, litmus_clock());

		/* Attempt to advance time. First CPU to get here
		 * will prepare the next quantum.
		 */
		time = cpu_cluster(state)->pfair_time;
		if (time == cur - 1) {
			/* looks good, see if we can advance the time */
			time = cmpxchg(&cpu_cluster(state)->pfair_time,
				       cur - 1, /* expected */
				       cur      /* next     */
				);
		}

		if (time == cur - 1) {
			/* exchange succeeded */
			wait_for_quantum(cur - 1, state);
			schedule_next_quantum(cpu_cluster(state), cur);
			retry = 0;
		} else if (time_before(time, cur - 1)) {
			/* the whole system missed a tick !? */
			catchup_quanta(time, cur, state);
			retry--;
		} else if (time_after(time, cur)) {
			/* our timer is lagging behind!? */
			TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
			retry--;
		} else {
			/* Some other CPU already started scheduling
			 * this quantum. Let it do its job and then update.
			 */
			retry = 0;
		}
	} while (retry);

	/* Spin locally until time advances. */
	wait_for_quantum(cur, state);

	/* copy assignment */
	/* FIXME: what if we race with a future update? Corrupted state? */
	state->local = state->linked;
	/* signal that we are done */
	mb();
	state->local_tick = state->cur_tick;

	if (state->local != current
	    && (is_realtime(current) || is_present(state->local)))
		litmus_reschedule_local();
}

static void process_out_of_budget_tasks(
	struct pfair_state* state,
	struct task_struct* prev,
	unsigned int blocks)
{
	struct task_struct *t;

	while (!list_empty(&state->out_of_budget)) {
		t = list_first_entry(&state->out_of_budget,
				     struct task_struct, rt_param.list);
		TRACE_TASK(t, "found on out_of_budget queue is_prev=%d\n",
			   t == prev);
		list_del(&tsk_rt(t)->list);
		if (t != prev || !blocks) {
			sched_trace_task_release(t);
			add_release(&cpu_cluster(state)->pfair, t);
			TRACE_TASK(t, "adding to release queue (budget exhausted)\n");
		} else {
			TRACE_TASK(t, "not added to release queue (blocks=%d)\n",
				   blocks);
			tsk_pfair(t)->needs_requeue = 1;
		}
		if (unlikely(state->local == t)) {
			TRACE_TASK(t, "still linked as ->local, cleaning up\n");
			state->local = NULL;
		}
	}
}
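/*
 * Each CPU drives its own quantum timer: on_quantum_boundary() below runs
 * pfair_tick() and then re-arms the timer LITMUS_QUANTUM_LENGTH_NS into the
 * future. Because every CPU's timer is initially programmed with its own
 * cpu_stagger_offset(), quantum boundaries may be aligned or staggered
 * across CPUs, depending on how the offsets are configured.
 */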
/* Custom scheduling tick: called on each quantum boundary. */
static enum hrtimer_restart on_quantum_boundary(struct hrtimer *timer)
{
	TS_QUANTUM_BOUNDARY_START;

	pfair_tick(current);
	hrtimer_add_expires_ns(timer, LITMUS_QUANTUM_LENGTH_NS);

	TS_QUANTUM_BOUNDARY_END;
	return HRTIMER_RESTART;
}

static int safe_to_schedule(struct task_struct* t, int cpu)
{
	int where = tsk_rt(t)->scheduled_on;
	if (where != NO_CPU && where != cpu) {
		TRACE_TASK(t, "BAD: can't be scheduled on %d, "
			   "scheduled already on %d.\n", cpu, where);
		return 0;
	} else
		return is_present(t) && !is_completed(t);
}

static struct task_struct* pfair_schedule(struct task_struct * prev)
{
	struct pfair_state* state = this_cpu_ptr(&pfair_state);
	struct pfair_cluster* cluster = cpu_cluster(state);
	int blocks, completion, out_of_time;
	struct task_struct* next = NULL;

#ifdef CONFIG_RELEASE_MASTER
	/* Bail out early if we are the release master.
	 * The release master never schedules any real-time tasks.
	 */
	if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
		goto out;
	}
#endif

	raw_spin_lock(cpu_lock(state));

	blocks      = is_realtime(prev) && !is_current_running();
	completion  = is_realtime(prev) && is_completed(prev);
	out_of_time = is_realtime(prev) && time_after(cur_release(prev),
						      state->local_tick);

	if (is_realtime(prev))
		PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
			    blocks, completion, out_of_time);

	if (completion && !out_of_time) {
		sched_trace_task_completion(prev, 0);
		pfair_prepare_next_period(prev);
		prepare_release(prev, cur_release(prev));
		drop_all_references(prev);
		list_add(&tsk_rt(prev)->list, &state->out_of_budget);
	}

	process_out_of_budget_tasks(state, prev, blocks);

	if (state->local && safe_to_schedule(state->local, cpu_id(state)))
		next = state->local;

	if (prev != next) {
		tsk_rt(prev)->scheduled_on = NO_CPU;
		if (next)
			tsk_rt(next)->scheduled_on = cpu_id(state);
	}
	sched_state_task_picked();
	raw_spin_unlock(cpu_lock(state));

	if (next)
		TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
			   tsk_pfair(next)->release,
			   cpu_cluster(state)->pfair_time, litmus_clock());
	else if (is_realtime(prev))
		TRACE("Becomes idle at %lu (%llu)\n",
		      cpu_cluster(state)->pfair_time, litmus_clock());

#ifdef CONFIG_RELEASE_MASTER
out:
#endif

	if (unlikely(!hrtimer_active(&state->quantum_timer))) {
		TRACE("activating quantum timer start=%llu\n",
		      hrtimer_get_expires(&state->quantum_timer));
		__hrtimer_start_range_ns(&state->quantum_timer,
			hrtimer_get_expires(&state->quantum_timer),
			0, HRTIMER_MODE_ABS, 0);
	}

	return next;
}

static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
{
	unsigned long flags;
	struct pfair_cluster* cluster;

	TRACE("pfair: task new %d state:%d\n", t->pid, t->state);

	cluster = tsk_pfair(t)->cluster;

	raw_spin_lock_irqsave(cluster_lock(cluster), flags);

	prepare_release(t, cluster->pfair_time + 1);
	release_at(t, quanta2time(cur_release(t)));

	t->rt_param.scheduled_on = NO_CPU;
	t->rt_param.linked_on    = NO_CPU;

	if (is_scheduled) {
#ifdef CONFIG_RELEASE_MASTER
		if (task_cpu(t) != cluster->pfair.release_master)
#endif
			t->rt_param.scheduled_on = task_cpu(t);
	}

	if (on_rq || is_scheduled) {
		tsk_rt(t)->present = 1;
		__add_ready(&cluster->pfair, t);
	} else {
		tsk_rt(t)->present = 0;
		tsk_pfair(t)->needs_requeue = 1;
	}

	check_preempt(t);

	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
}
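/*
 * Wake-up handling: a task that wakes before its next release and is still
 * linked somewhere simply resumes, as if it had never blocked. A tardy task
 * is treated as a new sporadic release (its release point is reset to
 * "now"). Re-queueing only happens if needs_requeue was set when the task
 * was removed from the scheduler's data structures while it was asleep.
 */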
static void pfair_task_wake_up(struct task_struct *t)
{
	unsigned long flags;
	lt_t now;
	struct pfair_cluster* cluster;
	struct pfair_state* state;
	int sporadic_release = 0;

	cluster = tsk_pfair(t)->cluster;

	TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
		   litmus_clock(), cur_release(t), cluster->pfair_time);

	raw_spin_lock_irqsave(cluster_lock(cluster), flags);

	state = this_cpu_ptr(&pfair_state);

	/* If a task blocks and wakes before its next job release,
	 * then it may resume if it is currently linked somewhere
	 * (as if it never blocked at all). Otherwise, we have a
	 * new sporadic job release.
	 */
	now = litmus_clock();
	if (is_tardy(t, now)) {
		TRACE_TASK(t, "sporadic release!\n");
		sporadic_release = 1;
		release_at(t, now);
		prepare_release(t, time2quanta(now, CEIL));
		sched_trace_task_release(t);
	}

	/* only add to ready queue if the task isn't still linked somewhere */
	if (tsk_pfair(t)->needs_requeue) {
		tsk_pfair(t)->needs_requeue = 0;
		TRACE_TASK(t, "requeueing required (released:%d)\n",
			   !time_after(cur_release(t), state->local_tick));
		tsk_rt(t)->completed = 0;
		if (time_after(cur_release(t), state->local_tick)
		    && !sporadic_release)
			add_release(&cluster->pfair, t);
		else
			__add_ready(&cluster->pfair, t);
	}

	check_preempt(t);

	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
	TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
}

static void pfair_task_block(struct task_struct *t)
{
	BUG_ON(!is_realtime(t));
	TRACE_TASK(t, "blocks at %llu, state:%d\n",
		   litmus_clock(), t->state);
}

static void pfair_task_exit(struct task_struct * t)
{
	unsigned long flags;
	struct pfair_cluster *cluster;

	BUG_ON(!is_realtime(t));

	cluster = tsk_pfair(t)->cluster;

	/* Remove task from release or ready queue, and ensure
	 * that it is not the scheduled task for ANY CPU. We
	 * do this blanket check because occasionally when
	 * tasks exit while blocked, the task_cpu of the task
	 * might not be the same as the CPU that the PFAIR scheduler
	 * has chosen for it.
	 */
	raw_spin_lock_irqsave(cluster_lock(cluster), flags);

	TRACE_TASK(t, "RIP, state:%d\n", t->state);
	drop_all_references(t);

	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);

	kfree(t->rt_param.pfair);
	t->rt_param.pfair = NULL;
}

static void init_subtask(struct subtask* sub, unsigned long i,
			 lt_t quanta, lt_t period)
{
	/* since i is zero-based, the formulas are shifted by one */
	lt_t tmp;

	/* release */
	tmp = period * i;
	do_div(tmp, quanta); /* floor */
	sub->release = (quanta_t) tmp;

	/* deadline */
	tmp = period * (i + 1);
	if (do_div(tmp, quanta)) /* ceil */
		tmp++;
	sub->deadline = (quanta_t) tmp;

	/* next release */
	tmp = period * (i + 1);
	do_div(tmp, quanta); /* floor */
	sub->overlap = sub->deadline - (quanta_t) tmp;

	/* Group deadline.
	 * Based on the formula given in Uma's thesis.
	 */
	if (2 * quanta >= period) {
		/* heavy */
		tmp = (sub->deadline - (i + 1)) * period;
		if (period > quanta &&
		    do_div(tmp, (period - quanta))) /* ceil */
			tmp++;
		sub->group_deadline = (quanta_t) tmp;
	} else
		sub->group_deadline = 0;
}

static void dump_subtasks(struct task_struct* t)
{
	unsigned long i;
	for (i = 0; i < t->rt_param.pfair->quanta; i++)
		TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
			   i + 1,
			   t->rt_param.pfair->subtasks[i].release,
			   t->rt_param.pfair->subtasks[i].deadline,
			   t->rt_param.pfair->subtasks[i].overlap,
			   t->rt_param.pfair->subtasks[i].group_deadline);
}
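/*
 * Worked example of the window parameters computed by init_subtask() above
 * (hypothetical task parameters, given for illustration only). For a task
 * with quanta = 3 and period = 4 (weight 3/4, "heavy" since
 * 2 * quanta >= period), the subtasks are, in quanta relative to the job
 * release:
 *
 *   subtask 1: rel=0 dl=2 bbit=1 gdl=4
 *   subtask 2: rel=1 dl=3 bbit=1 gdl=4
 *   subtask 3: rel=2 dl=4 bbit=0 gdl=4
 *
 * For a light task such as quanta = 2, period = 5 (weight 2/5), the group
 * deadline is always 0:
 *
 *   subtask 1: rel=0 dl=3 bbit=1 gdl=0
 *   subtask 2: rel=2 dl=5 bbit=0 gdl=0
 */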
" "Only implicit deadlines are currently supported.\n", litmus->plugin_name); return -EINVAL; } /* Pfair is a tick-based scheduler, so the unit of time * is one quantum. Calculate quantum-based parameters for everything. * (Ceiling of exec cost, floor of period.) */ quanta = get_exec_cost(t); period = get_rt_period(t); quanta = time2quanta(get_exec_cost(t), CEIL); if (do_div(period, quantum_length)) printk(KERN_WARNING "The period of %s/%d is not a multiple of %llu.\n", t->comm, t->pid, (unsigned long long) quantum_length); if (quanta == period) { PTRACE_TASK(t, "Admitting weight 1.0 task. (%llu, %llu).\n", quanta, period); } param = kzalloc(sizeof(*param) + quanta * sizeof(struct subtask), GFP_ATOMIC); if (!param) return -ENOMEM; param->quanta = quanta; param->period = period; param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]); for (i = 0; i < quanta; i++) init_subtask(param->subtasks + i, i, quanta, period); if (t->rt_param.pfair) /* get rid of stale allocation */ kfree(t->rt_param.pfair); t->rt_param.pfair = param; /* spew out some debug info */ dump_subtasks(t); /* Disable generic budget enforcement (if enabled). * The plugin provides its own (non-optional) enforcement * of allocations at quantum granularity. */ tsk_rt(t)->task_params.budget_policy = NO_ENFORCEMENT; return 0; } static void pfair_init_cluster(struct pfair_cluster* cluster) { rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs); bheap_init(&cluster->release_queue); raw_spin_lock_init(&cluster->release_lock); INIT_LIST_HEAD(&cluster->topology.cpus); } static void cleanup_clusters(void) { int i; if (num_pfair_clusters) kfree(pfair_clusters); pfair_clusters = NULL; num_pfair_clusters = 0; /* avoid stale pointers */ for (i = 0; i < num_online_cpus(); i++) { pstate[i]->topology.cluster = NULL; printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]), pstate[i]->missed_updates, pstate[i]->missed_quanta); } } static struct domain_proc_info pfair_domain_proc_info; static long pfair_get_domain_proc_info(struct domain_proc_info **ret) { *ret = &pfair_domain_proc_info; return 0; } static void pfair_setup_domain_proc(void) { int i, cpu, domain; #ifdef CONFIG_RELEASE_MASTER int release_master = atomic_read(&release_master_cpu); /* skip over the domain with the release master if cluster size is 1 */ int cluster_size = num_online_cpus() / num_pfair_clusters; int skip_domain = (1 == cluster_size && release_master != NO_CPU) ? 
static void pfair_setup_domain_proc(void)
{
	int i, cpu, domain;
#ifdef CONFIG_RELEASE_MASTER
	int release_master = atomic_read(&release_master_cpu);
	/* skip over the domain with the release master if cluster size is 1 */
	int cluster_size = num_online_cpus() / num_pfair_clusters;
	int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
			release_master : NO_CPU;
#else
	int release_master = NO_CPU;
	int skip_domain = NO_CPU;
#endif
	int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
	int num_rt_domains = num_pfair_clusters - (skip_domain != NO_CPU);
	struct cd_mapping *map;

	memset(&pfair_domain_proc_info, 0, sizeof(pfair_domain_proc_info));
	init_domain_proc_info(&pfair_domain_proc_info, num_rt_cpus,
			      num_pfair_clusters);
	pfair_domain_proc_info.num_cpus = num_rt_cpus;
	pfair_domain_proc_info.num_domains = num_rt_domains;

	for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
		if (cpu == release_master)
			continue;
		map = &pfair_domain_proc_info.cpu_to_domains[i];
		/* pointer math to figure out the domain index */
		domain = cpu_cluster(&per_cpu(pfair_state, cpu)) - pfair_clusters;
		map->id = cpu;
		cpumask_set_cpu(domain, map->mask);
		++i;
	}

	for (domain = 0, i = 0; domain < num_pfair_clusters; ++domain) {
		struct pfair_cluster *cluster;
		struct list_head *pos;

		if (domain == skip_domain)
			continue;

		cluster = &pfair_clusters[domain];
		map = &pfair_domain_proc_info.domain_to_cpus[i];
		map->id = i;

		list_for_each(pos, &cluster->topology.cpus) {
			cpu = cpu_id(from_cluster_list(pos));
			if (cpu != release_master)
				cpumask_set_cpu(cpu, map->mask);
		}
		++i;
	}
}

static long pfair_activate_plugin(void)
{
	int err, i;
	struct pfair_state* state;
	struct pfair_cluster* cluster;
	quanta_t now, start;
	int cluster_size;
	struct cluster_cpu* cpus[NR_CPUS];
	struct scheduling_cluster* clust[NR_CPUS];
	lt_t quantum_timer_start;

	cluster_size = get_cluster_size(pfair_cluster_level);

	if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
		return -EINVAL;

	num_pfair_clusters = num_online_cpus() / cluster_size;

	pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster),
				 GFP_ATOMIC);
	if (!pfair_clusters) {
		num_pfair_clusters = 0;
		printk(KERN_ERR "Could not allocate Pfair clusters!\n");
		return -ENOMEM;
	}

	state = this_cpu_ptr(&pfair_state);
	now = current_quantum(state);
	start = now + 50;
	quantum_timer_start = quanta2time(start);
	TRACE("Activating PFAIR at %llu (q=%lu), first tick at %llu (q=%lu)\n",
	      litmus_clock(),
	      now,
	      quantum_timer_start,
	      time2quanta(quantum_timer_start, CEIL));

	for (i = 0; i < num_pfair_clusters; i++) {
		cluster = &pfair_clusters[i];
		pfair_init_cluster(cluster);
		cluster->pfair_time = start;
		clust[i] = &cluster->topology;
#ifdef CONFIG_RELEASE_MASTER
		cluster->pfair.release_master = atomic_read(&release_master_cpu);
#endif
	}

	for_each_online_cpu(i) {
		state = &per_cpu(pfair_state, i);
		state->cur_tick       = start;
		state->local_tick     = start;
		state->missed_quanta  = 0;
		state->missed_updates = 0;
		state->offset         = cpu_stagger_offset(i);
		hrtimer_set_expires(&state->quantum_timer,
				    ns_to_ktime(quantum_timer_start + state->offset));
		cpus[i] = &state->topology;
		TRACE("cpus[%d] set; offset=%llu; %d\n",
		      i, state->offset, num_online_cpus());
		INIT_LIST_HEAD(&state->out_of_budget);
		/* force rescheduling to start quantum timer */
		litmus_reschedule(i);

		WARN_ONCE(!hrtimer_is_hres_active(&state->quantum_timer),
			  KERN_ERR "WARNING: no high resolution timers available!?\n");
	}

	err = assign_cpus_to_clusters(pfair_cluster_level, clust,
				      num_pfair_clusters,
				      cpus, num_online_cpus());

	if (err < 0)
		cleanup_clusters();
	else
		pfair_setup_domain_proc();

	return err;
}
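/*
 * Teardown: the per-CPU quantum timers are cancelled before
 * cleanup_clusters() frees the cluster array, presumably so that no late
 * timer callback can run pfair_tick() against already-freed cluster state.
 */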
static long pfair_deactivate_plugin(void)
{
	int cpu;
	struct pfair_state* state;

	for_each_online_cpu(cpu) {
		state = &per_cpu(pfair_state, cpu);
		TRACE("stopping quantum timer on CPU%d\n", cpu);
		hrtimer_cancel(&state->quantum_timer);
	}
	cleanup_clusters();
	destroy_domain_proc_info(&pfair_domain_proc_info);
	return 0;
}

/*	Plugin object	*/
static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
	.plugin_name		= "PFAIR",
	.task_new		= pfair_task_new,
	.task_exit		= pfair_task_exit,
	.schedule		= pfair_schedule,
	.task_wake_up		= pfair_task_wake_up,
	.task_block		= pfair_task_block,
	.admit_task		= pfair_admit_task,
	.complete_job		= complete_job,
	.activate_plugin	= pfair_activate_plugin,
	.deactivate_plugin	= pfair_deactivate_plugin,
	.get_domain_proc_info	= pfair_get_domain_proc_info,
};

static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;

static int __init init_pfair(void)
{
	int cpu, err, fs;
	struct pfair_state *state;

	/*
	 * initialize short_cut for per-cpu pfair state;
	 * there may be a problem here if someone removes a cpu
	 * while we are doing this initialization... and if cpus
	 * are added / removed later... but we don't support CPU hotplug
	 * at the moment anyway.
	 */
	pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(),
			 GFP_KERNEL);
	if (!pstate)
		return -ENOMEM;

	/* initialize CPU state */
	for (cpu = 0; cpu < num_online_cpus(); cpu++) {
		state = &per_cpu(pfair_state, cpu);
		hrtimer_init(&state->quantum_timer,
			     CLOCK_MONOTONIC,
			     HRTIMER_MODE_ABS_PINNED);
		state->quantum_timer.function = on_quantum_boundary;
		state->topology.id   = cpu;
		state->cur_tick      = 0;
		state->local_tick    = 0;
		state->linked        = NULL;
		state->local         = NULL;
		state->scheduled     = NULL;
		state->missed_quanta = 0;
		state->offset        = cpu_stagger_offset(cpu);
		pstate[cpu] = state;
	}

	pfair_clusters = NULL;
	num_pfair_clusters = 0;

	err = register_sched_plugin(&pfair_plugin);
	if (!err) {
		fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
		if (!fs)
			cluster_file = create_cluster_file(pfair_dir,
							   &pfair_cluster_level);
		else
			printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
	}
	return err;
}

static void __exit clean_pfair(void)
{
	kfree(pstate);

	if (cluster_file)
		remove_proc_entry("cluster", pfair_dir);
	if (pfair_dir)
		remove_plugin_proc_dir(&pfair_plugin);
}

module_init(init_pfair);
module_exit(clean_pfair);