/*
 * kernel/sched_pfair.c
 *
 * Implementation of the (global) Pfair scheduling algorithm.
 *
 */

/* NOTE: the original header names were lost; the includes below list the
 * kernel and LITMUS^RT headers this file relies on (percpu state, spinlocks,
 * kmalloc, do_div, the rt_domain, plugin, and heap interfaces).
 */
#include <asm/div64.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/slab.h>

#include <litmus/litmus.h>
#include <litmus/jobs.h>
#include <litmus/rt_domain.h>
#include <litmus/sched_plugin.h>
#include <litmus/sched_trace.h>
#include <litmus/heap.h>

/* Tick period is used to convert ns-specified execution
 * costs and periods into tick-based equivalents.
 */
extern ktime_t tick_period;

/* make the unit explicit */
typedef unsigned long quanta_t;

struct subtask {
	/* measured in quanta relative to job release */
	quanta_t release;
	quanta_t deadline;
	quanta_t overlap;	/* called "b bit" by PD^2 */
	quanta_t group_deadline;
};

struct pfair_param {
	quanta_t	quanta;		/* number of subtasks */
	quanta_t	cur;		/* index of current subtask */

	quanta_t	release;	/* in quanta */
	quanta_t	period;		/* in quanta */

	quanta_t	last_quantum;	/* when scheduled last */
	int		last_cpu;	/* where scheduled last */

	unsigned int	present;	/* Can the task be scheduled? */

	struct subtask subtasks[0];	/* allocate together with pfair_param */
};

#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)

struct pfair_state {
	int cpu;
	volatile quanta_t cur_tick;	/* updated by the CPU that is advancing
					 * the time */
	volatile quanta_t local_tick;	/* What tick is the local CPU currently
					 * executing? Updated only by the local
					 * CPU. In QEMU, this may lag behind the
					 * current tick. In a real system, with
					 * proper timers and aligned quanta,
					 * that should only be the case for a
					 * very short time after the time
					 * advanced. With staggered quanta, it
					 * will lag for the duration of the
					 * offset.
					 */

	struct task_struct* linked;	/* the task that should be executing */
	struct task_struct* local;	/* the local copy of linked           */
	struct task_struct* scheduled;	/* what is actually scheduled         */

	unsigned long missed_quanta;
};

/* Currently, we limit the maximum period of any task to 1000 quanta.
 * The reason is that it makes the implementation easier since we do not
 * need to reallocate the release wheel on task arrivals.
 * In the future, this restriction could be lifted by resizing the wheel
 * dynamically.
 */
#define PFAIR_MAX_PERIOD 1000

/* This is the release queue wheel. It is indexed by
 * pfair_time % PFAIR_MAX_PERIOD.  Each heap is ordered by PFAIR priority,
 * so that it can be merged with the ready queue.
 */
static struct heap release_queue[PFAIR_MAX_PERIOD];

DEFINE_PER_CPU(struct pfair_state, pfair_state);
struct pfair_state* pstate[NR_CPUS]; /* short cut */

#define NO_CPU 0xffffffff

static quanta_t pfair_time = 0; /* the "official" PFAIR clock */
static quanta_t merge_time = 0; /* Updated after the release queue has been
				 * merged. Used by drop_all_references().
				 */

static rt_domain_t pfair;

/* The pfair_lock is used to serialize all scheduling events. */
#define pfair_lock pfair.ready_lock

/* Enable for lots of trace info.
 * #define PFAIR_DEBUG
 */

#ifdef PFAIR_DEBUG
#define PTRACE_TASK(t, f, args...)  TRACE_TASK(t, f, ## args)
#define PTRACE(f, args...)	    TRACE(f, ## args)
#else
#define PTRACE_TASK(t, f, args...)
#define PTRACE(f, args...)
#endif
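/* Rough sketch of how one quantum is processed (see pfair_tick(),
 * schedule_next_quantum(), and pfair_schedule() below for the
 * authoritative details):
 *
 *  1) In pfair_tick(), every CPU races on
 *     cmpxchg(&pfair_time, local_tick, local_tick + 1); the winner
 *     prepares the new quantum via schedule_next_quantum().
 *  2) schedule_next_quantum() advances the subtasks of all linked
 *     tasks, merges newly released subtasks into the ready queue
 *     (poll_releases()), and re-links the highest-priority subtasks
 *     to CPUs (schedule_subtasks()).
 *  3) The winner then publishes the new quantum in each CPU's
 *     cur_tick; the other CPUs spin in pfair_tick() until they see it,
 *     copy linked into local, and request a reschedule.
 *  4) pfair_schedule() finally makes the local copy the scheduled
 *     task, provided it is present and not scheduled elsewhere
 *     (safe_to_schedule()).
 */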
/* gcc will inline all of these accessor functions... */
static struct subtask* cur_subtask(struct task_struct* t)
{
	return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
}

static quanta_t cur_deadline(struct task_struct* t)
{
	return cur_subtask(t)->deadline + tsk_pfair(t)->release;
}

static quanta_t cur_release(struct task_struct* t)
{
#ifdef EARLY_RELEASE
	/* only the release of the first subtask counts when we early
	 * release */
	return tsk_pfair(t)->release;
#else
	return cur_subtask(t)->release + tsk_pfair(t)->release;
#endif
}

static quanta_t cur_sub_release(struct task_struct* t)
{
	return cur_subtask(t)->release + tsk_pfair(t)->release;
}

static quanta_t cur_overlap(struct task_struct* t)
{
	return cur_subtask(t)->overlap;
}

static quanta_t cur_group_deadline(struct task_struct* t)
{
	quanta_t gdl = cur_subtask(t)->group_deadline;
	if (gdl)
		return gdl + tsk_pfair(t)->release;
	else
		return gdl;
}

enum round {
	FLOOR,
	CEIL
};

static quanta_t time2quanta(lt_t time, enum round round)
{
	s64 quantum_length = ktime_to_ns(tick_period);

	if (do_div(time, quantum_length) && round == CEIL)
		time++;
	return (quanta_t) time;
}

static int pfair_higher_prio(struct task_struct* first,
			     struct task_struct* second)
{
	return	/* first task must exist */
		first && (
		/* Does the second task exist and is it a real-time task?  If
		 * not, the first task (which is a RT task) has higher
		 * priority.
		 */
		!second || !is_realtime(second) ||

		/* Is the (subtask) deadline of the first task earlier?
		 * Then it has higher priority.
		 */
		time_before(cur_deadline(first), cur_deadline(second)) ||

		/* Do we have a deadline tie?
		 * Then break by B-bit.
		 */
		(cur_deadline(first) == cur_deadline(second) &&
		 cur_overlap(first) > cur_overlap(second)) ||

		/* Do we have a B-bit tie?
		 * Then break by group deadline.
		 */
		(cur_overlap(first) == cur_overlap(second) &&
		 time_after(cur_group_deadline(first),
			    cur_group_deadline(second))) ||

		/* Do we have a group deadline tie?
		 * Then break by PID, which are unique.
		 */
		(cur_group_deadline(first) == cur_group_deadline(second) &&
		 first->pid < second->pid));
}

int pfair_ready_order(struct heap_node* a, struct heap_node* b)
{
	return pfair_higher_prio(heap2task(a), heap2task(b));
}

/* return the proper release queue for time t */
static struct heap* relq(quanta_t t)
{
	struct heap* rq = &release_queue[t % PFAIR_MAX_PERIOD];
	return rq;
}

static void prepare_release(struct task_struct* t, quanta_t at)
{
	tsk_pfair(t)->release = at;
	tsk_pfair(t)->cur     = 0;
}

static void __pfair_add_release(struct task_struct* t, struct heap* queue)
{
	heap_insert(pfair_ready_order, queue, tsk_rt(t)->heap_node);
}

static void pfair_add_release(struct task_struct* t)
{
	BUG_ON(heap_node_in_heap(tsk_rt(t)->heap_node));
	__pfair_add_release(t, relq(cur_release(t)));
}

/* pull released tasks from the release queue */
static void poll_releases(quanta_t time)
{
	heap_union(pfair_ready_order, &pfair.ready_queue, relq(time));
	merge_time = time;
}

static void check_preempt(struct task_struct* t)
{
	int cpu = NO_CPU;
	if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
	    tsk_pfair(t)->present) {
		/* the task can be scheduled and
		 * is not scheduled where it ought to be scheduled
		 */
		cpu = tsk_rt(t)->linked_on != NO_CPU ?
			tsk_rt(t)->linked_on         :
			tsk_rt(t)->scheduled_on;
		PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
			    tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
		/* preempt */
		if (cpu == smp_processor_id())
			set_tsk_need_resched(current);
		else {
			smp_send_reschedule(cpu);
		}
	}
}
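/* Illustrative example of the PD^2 comparison implemented by
 * pfair_higher_prio() above (hypothetical values): let subtasks A and B
 * both have cur_deadline() == 12.  If A has its b-bit set
 * (cur_overlap(A) == 1) while B does not, A has higher priority.  If
 * both b-bits are set, the subtask with the *later* group deadline wins
 * (time_after() above).  Only when deadline, b-bit, and group deadline
 * all match is the tie broken in favor of the lower PID.
 */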
/* returns 1 if the task needs to go the release queue */
static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
{
	struct pfair_param* p = tsk_pfair(t);

	p->cur = (p->cur + 1) % p->quanta;
	TRACE_TASK(t, "on %d advanced to subtask %lu\n", cpu, p->cur);
	if (!p->cur) {
		/* we start a new job */
		get_rt_flags(t) = RT_F_RUNNING;
		prepare_for_next_period(t);
		p->release += p->period;
	}
	return time_after(cur_release(t), time);
}

static void advance_subtasks(quanta_t time)
{
	int cpu, missed;
	struct task_struct* l;
	struct pfair_param* p;

	for_each_online_cpu(cpu) {
		l = pstate[cpu]->linked;
		missed = pstate[cpu]->linked != pstate[cpu]->local;
		if (l) {
			p = tsk_pfair(l);
			p->last_quantum = time;
			p->last_cpu     = cpu;
			if (advance_subtask(time, l, cpu)) {
				pstate[cpu]->linked = NULL;
				pfair_add_release(l);
			}
		}
	}
}

static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
{
	int cpu;
	if (tsk_rt(t)->scheduled_on != NO_CPU) {
		/* always observe scheduled_on linkage */
		default_cpu = tsk_rt(t)->scheduled_on;
		PTRACE_TASK(t, "forced on %d (scheduled on)\n", default_cpu);
	} else if (tsk_pfair(t)->last_quantum == time - 1) {
		/* back2back quanta */
		/* Only observe last_quantum if no scheduled_on is in the way.
		 * This should only kick in if a CPU missed quanta, and that
		 * *should* only happen in QEMU.
		 */
		cpu = tsk_pfair(t)->last_cpu;
		if (!pstate[cpu]->linked ||
		    tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
			default_cpu = cpu;
			PTRACE_TASK(t, "forced on %d (linked on)\n",
				    default_cpu);
		} else {
			PTRACE_TASK(t, "DID NOT force on %d (linked on)\n",
				    default_cpu);
		}
	}
	return default_cpu;
}

/* returns one if linking was redirected */
static int pfair_link(quanta_t time, int cpu, struct task_struct* t)
{
	int target = target_cpu(time, t, cpu);
	struct task_struct* prev = pstate[cpu]->linked;
	struct task_struct* other;

	PTRACE_TASK(t, "linked to %d for quantum %lu\n", target, time);
	if (target != cpu) {
		other = pstate[target]->linked;
		pstate[target]->linked = t;
		tsk_rt(t)->linked_on   = target;
		if (!other)
			/* linked ok, but reschedule this CPU */
			return 1;
		if (target < cpu) {
			/* link other to cpu instead */
			tsk_rt(other)->linked_on = cpu;
			pstate[cpu]->linked      = other;
			if (prev) {
				/* prev got pushed back into the ready queue */
				tsk_rt(prev)->linked_on = NO_CPU;
				__add_ready(&pfair, prev);
			}
			/* we are done with this cpu */
			return 0;
		} else {
			/* re-add other, its original CPU was not
			 * considered yet */
			tsk_rt(other)->linked_on = NO_CPU;
			__add_ready(&pfair, other);
			/* reschedule this CPU */
			return 1;
		}
	} else {
		pstate[cpu]->linked  = t;
		tsk_rt(t)->linked_on = cpu;
		if (prev) {
			/* prev got pushed back into the ready queue */
			tsk_rt(prev)->linked_on = NO_CPU;
			__add_ready(&pfair, prev);
		}
		/* we are done with this CPU */
		return 0;
	}
}

static void schedule_subtasks(quanta_t time)
{
	int cpu, retry;

	for_each_online_cpu(cpu) {
		retry = 1;
		while (retry) {
			if (pfair_higher_prio(__peek_ready(&pfair),
					      pstate[cpu]->linked))
				retry = pfair_link(time, cpu,
						   __take_ready(&pfair));
			else
				retry = 0;
		}
	}
}

static void schedule_next_quantum(quanta_t time)
{
	int cpu;

	PTRACE("<<< Q %lu at %llu\n", time, litmus_clock());

	/* called with interrupts disabled */
	spin_lock(&pfair_lock);

	advance_subtasks(time);
	poll_releases(time);
	schedule_subtasks(time);

	spin_unlock(&pfair_lock);

	/* We are done. Advance time. */
	mb();
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pstate[cpu]->cur_tick = pfair_time;
	PTRACE(">>> Q %lu at %llu\n", time, litmus_clock());
}
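/* Example of the quantum hand-off performed by pfair_tick() below
 * (hypothetical numbers): suppose pfair_time == 41 and every CPU has
 * local_tick == 41.  Each CPU executes
 *
 *	time = cmpxchg(&pfair_time, 41, 42);
 *
 * Exactly one CPU gets the old value 41 back and therefore calls
 * schedule_next_quantum(42); the others see 42, skip the call, and
 * spin until the winner publishes cur_tick == 42.
 */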
/* pfair_tick - this function is called for every local timer
 *              interrupt.
 */
static void pfair_tick(struct task_struct* t)
{
	struct pfair_state* state = &__get_cpu_var(pfair_state);
	quanta_t time, loc, cur;

	/* Attempt to advance time. First CPU to get here
	 * will prepare the next quantum.
	 */
	time = cmpxchg(&pfair_time,
		       state->local_tick,	/* expected */
		       state->local_tick + 1	/* next     */
		);
	if (time == state->local_tick)
		/* exchange succeeded */
		schedule_next_quantum(time + 1);

	/* Spin locally until time advances. */
	while (1) {
		mb();
		cur = state->cur_tick;
		loc = state->local_tick;
		if (time_before(loc, cur)) {
			if (loc + 1 != cur) {
				TRACE("MISSED quantum! loc:%lu -> cur:%lu\n",
				      loc, cur);
				state->missed_quanta++;
			}
			break;
		}
		cpu_relax();
	}

	/* copy state info */
	state->local_tick = state->cur_tick;
	state->local      = state->linked;
	if (state->local && tsk_pfair(state->local)->present &&
	    state->local != current)
		set_tsk_need_resched(current);
}

static int safe_to_schedule(struct task_struct* t, int cpu)
{
	int where = tsk_rt(t)->scheduled_on;
	if (where != NO_CPU && where != cpu) {
		TRACE_TASK(t, "BAD: can't be scheduled on %d, "
			   "scheduled already on %d.\n", cpu, where);
		return 0;
	} else
		return tsk_pfair(t)->present &&
			get_rt_flags(t) == RT_F_RUNNING;
}

static struct task_struct* pfair_schedule(struct task_struct * prev)
{
	struct pfair_state* state = &__get_cpu_var(pfair_state);
	int blocks;
	struct task_struct* next = NULL;

	spin_lock(&pfair_lock);

	blocks = is_realtime(prev) && !is_running(prev);

	if (blocks)
		tsk_pfair(prev)->present = 0;

	if (state->local && safe_to_schedule(state->local, state->cpu))
		next = state->local;

	if (prev != next) {
		tsk_rt(prev)->scheduled_on = NO_CPU;
		if (next)
			tsk_rt(next)->scheduled_on = state->cpu;
	}

	spin_unlock(&pfair_lock);

	if (next)
		TRACE_TASK(next, "scheduled rel=%lu at %lu\n",
			   tsk_pfair(next)->release, pfair_time);
	else if (is_realtime(prev))
		TRACE("Becomes idle at %lu\n", pfair_time);

	return next;
}

static void pfair_task_new(struct task_struct * t, int on_rq, int running)
{
	unsigned long flags;

	TRACE("pfair: task new %d state:%d\n", t->pid, t->state);

	spin_lock_irqsave(&pfair_lock, flags);
	if (running)
		t->rt_param.scheduled_on = task_cpu(t);
	else
		t->rt_param.scheduled_on = NO_CPU;

	prepare_release(t, pfair_time + 1);
	tsk_pfair(t)->present = running;
	pfair_add_release(t);
	check_preempt(t);

	spin_unlock_irqrestore(&pfair_lock, flags);
}
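/* Worked example for the release wheel (hypothetical numbers): a task
 * admitted while pfair_time == 1536 gets prepare_release(t, 1537), so
 * cur_release(t) == 1537 and pfair_add_release() places it in
 * release_queue[1537 % PFAIR_MAX_PERIOD] == release_queue[537].
 * poll_releases(1537) then merges that slot into the ready queue at the
 * start of quantum 1537.
 */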
static void pfair_task_wake_up(struct task_struct *t)
{
	unsigned long flags;

	TRACE_TASK(t, "wakes at %lld, release=%lu, pfair_time:%lu\n",
		   litmus_clock(), cur_release(t), pfair_time);

	spin_lock_irqsave(&pfair_lock, flags);

	tsk_pfair(t)->present = 1;

	/* It is a little unclear how to deal with Pfair
	 * tasks that block for a while and then wake. For now, we
	 * assume that such suspensions are included in the stated
	 * execution time of the task, and thus count as execution
	 * time for our purposes. Thus, if the task is currently
	 * linked somewhere, it may resume, otherwise it has to wait
	 * for its next quantum allocation.
	 */

	check_preempt(t);

	spin_unlock_irqrestore(&pfair_lock, flags);
}

static void pfair_task_block(struct task_struct *t)
{
	BUG_ON(!is_realtime(t));
	TRACE_TASK(t, "blocks at %lld, state:%d\n",
		   (lt_t) jiffies, t->state);
}

/* caller must hold pfair_lock */
static void drop_all_references(struct task_struct *t)
{
	int cpu;
	struct pfair_state* s;
	struct heap* q;

	if (heap_node_in_heap(tsk_rt(t)->heap_node)) {
		/* figure out what queue the node is in */
		if (time_before_eq(cur_release(t), merge_time))
			q = &pfair.ready_queue;
		else
			q = relq(cur_release(t));
		heap_delete(pfair_ready_order, q, tsk_rt(t)->heap_node);
	}
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		s = &per_cpu(pfair_state, cpu);
		if (s->linked == t)
			s->linked = NULL;
		if (s->local == t)
			s->local = NULL;
		if (s->scheduled == t)
			s->scheduled = NULL;
	}
}

static void pfair_task_exit(struct task_struct * t)
{
	unsigned long flags;

	BUG_ON(!is_realtime(t));

	/* Remove task from release or ready queue, and ensure
	 * that it is not the scheduled task for ANY CPU. We
	 * do this blanket check because occasionally when
	 * tasks exit while blocked, the task_cpu of the task
	 * might not be the same as the CPU that the PFAIR scheduler
	 * has chosen for it.
	 */
	spin_lock_irqsave(&pfair_lock, flags);

	TRACE_TASK(t, "RIP, state:%d\n", t->state);
	drop_all_references(t);

	spin_unlock_irqrestore(&pfair_lock, flags);

	kfree(t->rt_param.pfair);
	t->rt_param.pfair = NULL;
}

static void pfair_release_at(struct task_struct* task, lt_t start)
{
	unsigned long flags;
	lt_t now = litmus_clock();
	quanta_t release, delta;

	BUG_ON(!is_realtime(task));

	spin_lock_irqsave(&pfair_lock, flags);
	if (lt_before(now, start)) {
		delta = time2quanta((long long) start - (long long) now,
				    CEIL);
		if (delta >= PFAIR_MAX_PERIOD)
			delta = PFAIR_MAX_PERIOD - 1;
	} else
		delta = 10; /* release in 10 ticks */

	release = pfair_time + delta;

	drop_all_references(task);
	prepare_release(task, release);
	pfair_add_release(task);

	spin_unlock_irqrestore(&pfair_lock, flags);
}

static void init_subtask(struct subtask* sub, unsigned long i,
			 lt_t quanta, lt_t period)
{
	/* since i is zero-based, the formulas are shifted by one */
	lt_t tmp;

	/* release */
	tmp = period * i;
	do_div(tmp, quanta); /* floor */
	sub->release = (quanta_t) tmp;

	/* deadline */
	tmp = period * (i + 1);
	if (do_div(tmp, quanta)) /* ceil */
		tmp++;
	sub->deadline = (quanta_t) tmp;

	/* next release */
	tmp = period * (i + 1);
	do_div(tmp, quanta); /* floor */
	sub->overlap = sub->deadline - (quanta_t) tmp;

	/* Group deadline.
	 * Based on the formula given in Uma's thesis.
	 */
	if (2 * quanta >= period) {
		/* heavy */
		tmp = (sub->deadline - (i + 1)) * period;
		if (do_div(tmp, (period - quanta))) /* ceil */
			tmp++;
		sub->group_deadline = (quanta_t) tmp;
	} else
		sub->group_deadline = 0;
}
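/* Worked example for the formulas above (a hypothetical task of weight
 * 3/5, i.e. quanta = 3, period = 5; i is the zero-based subtask index):
 *
 *   i    release   deadline   overlap (b-bit)   group_deadline
 *   0       0         2              1                 3
 *   1       1         4              1                 5
 *   2       3         5              0                 5
 *
 * Since 2 * quanta >= period, the task is "heavy" and gets non-zero
 * group deadlines; for a light task, group_deadline is 0 for every
 * subtask.
 */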
static void dump_subtasks(struct task_struct* t)
{
	unsigned long i;
	for (i = 0; i < t->rt_param.pfair->quanta; i++)
		TRACE_TASK(t,
			   "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
			   i + 1,
			   t->rt_param.pfair->subtasks[i].release,
			   t->rt_param.pfair->subtasks[i].deadline,
			   t->rt_param.pfair->subtasks[i].overlap,
			   t->rt_param.pfair->subtasks[i].group_deadline);
}

static long pfair_admit_task(struct task_struct* t)
{
	lt_t quanta;
	lt_t period;
	s64  quantum_length = ktime_to_ns(tick_period);
	struct pfair_param* param;
	unsigned long i;

	/* Pfair is a tick-based method, so the time
	 * of interest is jiffies. Calculate tick-based
	 * times for everything.
	 * (Ceiling of exec cost, floor of period.)
	 */
	quanta = get_exec_cost(t);
	period = get_rt_period(t);

	quanta = time2quanta(get_exec_cost(t), CEIL);

	if (do_div(period, quantum_length))
		printk(KERN_WARNING
		       "The period of %s/%d is not a multiple of %llu.\n",
		       t->comm, t->pid, (unsigned long long) quantum_length);

	if (period >= PFAIR_MAX_PERIOD) {
		printk(KERN_WARNING
		       "PFAIR: Rejecting task %s/%d; its period is too long.\n",
		       t->comm, t->pid);
		return -EINVAL;
	}

	param = kmalloc(sizeof(struct pfair_param) +
			quanta * sizeof(struct subtask), GFP_ATOMIC);

	if (!param)
		return -ENOMEM;

	param->quanta  = quanta;
	param->cur     = 0;
	param->release = 0;
	param->period  = period;

	for (i = 0; i < quanta; i++)
		init_subtask(param->subtasks + i, i, quanta, period);

	if (t->rt_param.pfair)
		/* get rid of stale allocation */
		kfree(t->rt_param.pfair);

	t->rt_param.pfair = param;

	/* spew out some debug info */
	dump_subtasks(t);

	return 0;
}

/* Plugin object */
static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
	.plugin_name		= "PFAIR",
	.tick			= pfair_tick,
	.task_new		= pfair_task_new,
	.task_exit		= pfair_task_exit,
	.schedule		= pfair_schedule,
	.task_wake_up		= pfair_task_wake_up,
	.task_block		= pfair_task_block,
	.admit_task		= pfair_admit_task,
	.release_at		= pfair_release_at,
	.complete_job		= complete_job
};

static int __init init_pfair(void)
{
	int cpu, i;
	struct pfair_state *state;

	/* initialize release queue */
	for (i = 0; i < PFAIR_MAX_PERIOD; i++)
		heap_init(&release_queue[i]);

	/* initialize CPU state */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		state = &per_cpu(pfair_state, cpu);
		state->cpu           = cpu;
		state->cur_tick      = 0;
		state->local_tick    = 0;
		state->linked        = NULL;
		state->local         = NULL;
		state->scheduled     = NULL;
		state->missed_quanta = 0;
		pstate[cpu] = state;
	}

	rt_domain_init(&pfair, pfair_ready_order, NULL, NULL);
	return register_sched_plugin(&pfair_plugin);
}

module_init(init_pfair);