/*
 * kernel/sched_adaptive.c
 *
 * Implementation of Aaron's adaptive global EDF scheduling algorithm. It is
 * based on the GSN-EDF scheduler. However, it does not support synchronization
 * primitives.
 *
 * It implements a version of FC-GEDF with a bunch of linearity assumptions for
 * the optimizer and the weight-transfer function. The code is meant to be
 * clear, but you really need to read the paper if you want to understand
 * what is going on here.
 *
 * Block et al., "Feedback-Controlled Adaptive Multiprocessor Real-Time
 * Systems", submitted to RTAS 2008.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Overview of GSN-EDF operations.
 *
 * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
 * description only covers how the individual operations are implemented in
 * LITMUS.
 *
 * link_task_to_cpu(T, cpu)   - Low-level operation to update the linkage
 *                              structure (NOT the actually scheduled
 *                              task). If there is another linked task To
 *                              already it will set To->linked_on = NO_CPU
 *                              (thereby removing its association with this
 *                              CPU). However, it will not requeue the
 *                              previously linked task (if any). It will set
 *                              T's state to RT_F_RUNNING and check whether
 *                              it is already running somewhere else. If T
 *                              is scheduled somewhere else it will link
 *                              it to that CPU instead (and pull the linked
 *                              task to cpu). T may be NULL.
 *
 * unlink(T)                  - Unlink removes T from all scheduler data
 *                              structures. If it is linked to some CPU it
 *                              will link NULL to that CPU. If it is
 *                              currently queued in the gsnedf queue it will
 *                              be removed from the T->rt_list. It is safe to
 *                              call unlink(T) if T is not linked. T may not
 *                              be NULL.
 *
 * requeue(T)                 - Requeue will insert T into the appropriate
 *                              queue. If the system is in real-time mode and
 *                              T has already been released, it will go into
 *                              the ready queue. If the system is not in
 *                              real-time mode, T will go into the release
 *                              queue. If T's release time is in the future,
 *                              it will go into the release queue. That means
 *                              that T's release time/job no/etc. has to be
 *                              updated before requeue(T) is called. It is
 *                              not safe to call requeue(T) when T is already
 *                              queued. T may not be NULL.
 *
 * gsnedf_job_arrival(T)      - This is the catch-all function when T enters
 *                              the system after either a suspension or a
 *                              job release. It will queue T (which means it
 *                              is not safe to call gsnedf_job_arrival(T) if
 *                              T is already queued) and then check whether a
 *                              preemption is necessary. If a preemption is
 *                              necessary it will update the linkage
 *                              accordingly and cause schedule() to be called
 *                              (either with an IPI or need_resched). It is
 *                              safe to call gsnedf_job_arrival(T) if T's
 *                              next job has not actually been released yet
 *                              (release time in the future). T will be put
 *                              on the release queue in that case.
 *
 * job_completion(T)          - Take care of everything that needs to be done
 *                              to prepare T for its next release and place
 *                              it in the right queue with
 *                              gsnedf_job_arrival().
 *
 *
 * When we know that T is linked to CPU then link_task_to_cpu(NULL, CPU) is
 * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
 * the functions will automatically propagate pending tasks from the ready
 * queue to a linked task. This is the job of the calling function (by means
 * of __take_ready).
 */
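/* Example of the linking rules above: suppose T is linked to CPU 0 but is
 * still scheduled on CPU 1 (it has not been preempted there yet). A call to
 * link_task_to_cpu() that targets CPU 0 will instead link T to CPU 1, where
 * it already runs, and link CPU 1's previously linked task to CPU 0,
 * avoiding an unnecessary migration.
 */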
static void unlink(struct task_struct* t);
static void adaptive_job_arrival(struct task_struct* task);

/* cpu_entry_t - maintain the linked and scheduled state
 */
typedef struct {
        int                     cpu;
        struct task_struct*     linked;         /* only RT tasks */
        struct task_struct*     scheduled;      /* only RT tasks */
        struct list_head        list;
        atomic_t                will_schedule;  /* prevent unneeded IPIs */
} cpu_entry_t;
DEFINE_PER_CPU(cpu_entry_t, adaptive_cpu_entries);

#define set_will_schedule() \
        (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 1))
#define clear_will_schedule() \
        (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 0))
#define test_will_schedule(cpu) \
        (atomic_read(&per_cpu(adaptive_cpu_entries, cpu).will_schedule))

#define NO_CPU 0xffffffff

/* The adaptive_lock is used to serialize all scheduling events.
 * It protects the data structures declared below.
 */
static queuelock_t adaptive_lock;

/* the cpus queue themselves according to priority in here */
static LIST_HEAD(adaptive_cpu_queue);

static rt_domain_t adaptive;

/* feedback control parameters */
static fp_t fc_a, fc_b;

/* optimizer trigger */
static jiffie_t last_optimizer_run;
static jiffie_t optimizer_min_invocation_sep;
static jiffie_t optimizer_period;
static fp_t     task_error_threshold;

static fp_t system_capacity;
/* total actual weight of the task system */
static fp_t total_weight;

/* optimizer time snapshot */
jiffie_t opt_time;

/* Delayed weight increase notification list.
 * This list gets clobbered on each optimizer run.
 */
static LIST_HEAD(adaptive_inc_list);

/* comment out to disable optimizer debugging */
#define ENABLE_OPTIMIZER_DEBUGGING

#ifdef ENABLE_OPTIMIZER_DEBUGGING
#define OPT_DBG   TRACE
#define OPT_DBG_T TRACE_TASK
#else
#define OPT_DBG
#define OPT_DBG_T
#endif

/******************************************************************************/
/*                              OPTIMIZER MATH                                */
/******************************************************************************/

/* All time-dependent functions
 * rely on opt_time.
 * Update in the optimizer before use!
 */
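/* For reference, an informal restatement of the estimation math implemented
 * by the functions below (derived from the code, not quoted from the paper):
 *
 *   ideal(w, dt)          = w * dt
 *   ideal_exec_time(T)    = round(est_weight(T) * (opt_time - last_release(T)))
 *   weight_transfer(T, from, to, act)
 *                         = act * weight(T, to) / weight(T, from)
 *                           (assumes the actual weight scales linearly with
 *                            the service-level weights)
 *   update_estimate(S, act, a, b):
 *                           err           = act - S.estimate
 *                           S.estimate    = a * err + b * S.accumulated
 *                           S.accumulated = S.accumulated + err
 */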
static inline fp_t ideal(fp_t weight, jiffie_t delta_t)
{
        return _mul(weight, FP(delta_t));
}

static noinline long ideal_exec_time(struct task_struct* t)
{
        jiffie_t delta = opt_time - get_last_release(t);
        return _round(ideal(get_est_weight(t), delta));
}

/* this makes a whole bunch of linearity assumptions */
static noinline fp_t weight_transfer(struct task_struct* t,
                                     unsigned int from, unsigned int to,
                                     fp_t act_weight)
{
        fp_t rel_from, rel_to, ret;
        rel_from = get_sl(t, from).weight;
        rel_to   = get_sl(t, to).weight;
        ret.val  = (act_weight.val * rel_to.val) / rel_from.val;
        OPT_DBG("weight_transfer(%ld, %ld, %ld) => %ld to=%u from=%u\n",
                rel_from.val, rel_to.val, act_weight.val, ret.val, from, to);

        return ret;
}

static noinline fp_t est_weight_at(struct task_struct* t, unsigned int level)
{
        if (t->rt_param.no_service_levels)
                return weight_transfer(t, get_cur_sl(t), level,
                                       get_est_weight(t));
        else
                return get_est_weight(t);
}

static noinline void update_estimate(predictor_state_t *state,
                                     fp_t actual_weight, fp_t a, fp_t b)
{
        fp_t err, new;

        OPT_DBG("OLD ESTIMATE Weight" _FP_ " ActWt " _FP_ " A:" _FP_
                ", B:" _FP_ "\n",
                fp2str(state->estimate), fp2str(actual_weight),
                fp2str(a), fp2str(b));

        err = _sub(actual_weight, state->estimate);
        new = _add(_mul(a, err), _mul(b, state->accumulated));

        total_weight = _sub(total_weight, state->estimate);
        state->estimate = new;
        total_weight = _add(total_weight, state->estimate);

        state->accumulated = _add(state->accumulated, err);

        OPT_DBG("ERROR " _FP_ ", NEW " _FP_ ", ACC" _FP_ "\n", fp2str(err),
                fp2str(new), fp2str(state->accumulated));
}

static noinline fp_t linear_metric(struct task_struct* t)
{
        fp_t v1, vmax, g1, gmax;
        fp_t est_w;
        unsigned int l = t->rt_param.no_service_levels;
        unsigned int lcur;

        if (l <= 1)
                return FP(0);

        lcur  = get_cur_sl(t);
        est_w = get_est_weight(t);

        OPT_DBG_T(t, " linear_metric: lcur=%u l=%u est_w=" _FP_ "\n",
                  lcur, l, est_w);
        OPT_DBG_T(t, " linear_metric: est_w.val=%ld\n", est_w.val);

        v1   = t->rt_param.service_level[0].value;
        vmax = t->rt_param.service_level[l - 1].value;

        OPT_DBG_T(t, " linear_metric: v1=" _FP_ " vmax=" _FP_ "\n", v1, vmax);
        OPT_DBG_T(t, " linear_metric: v1=%ld vmax=%ld\n", v1.val, vmax.val);

        g1   = weight_transfer(t, lcur, 0, est_w);
        gmax = weight_transfer(t, lcur, l - 1, est_w);

        OPT_DBG_T(t, " linear_metric: g1=" _FP_ " gmax=" _FP_ "\n", g1, gmax);
        OPT_DBG_T(t, " linear_metric: g1=%ld gmax=%ld\n", g1.val, gmax.val);

        TRACE_BUG_ON(_eq(_sub(gmax, g1), FP(0)));
        if (_eq(_sub(gmax, g1), FP(0)))
                return FP(0);

        return _div(_sub(vmax, v1), _sub(gmax, g1));
}

static noinline unsigned long reweighted_period(fp_t ow, fp_t nw,
                                                unsigned long alloc,
                                                jiffie_t deadline,
                                                jiffie_t release)
{
        fp_t dl;
        dl = _mul(FP(deadline - release), ow);
        dl = _sub(dl, FP(alloc));
        if (_eq(nw, FP(0)))
                return 0;
        dl = _div(dl, nw);
        return _round(dl);
}

static noinline int is_under_allocated(struct task_struct* t)
{
        return ideal_exec_time(t) >= t->rt_param.times.exec_time;
}

static noinline jiffie_t dec_equal_point_delay(struct task_struct* t)
{
        if (_lt(FP(0), get_est_weight(t)))
                /* when t was released plus time needed to equalize
                 * minus now
                 */
                return get_last_release(t) +
                        _round(_div(FP(t->rt_param.times.exec_time),
                                    get_est_weight(t))) -
                        opt_time;
        else
                /* if the weight is zero we just take the
                 * deadline
                 */
                return t->rt_param.times.deadline;
}
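/* Equal-point example (applies to both dec_ and inc_equal_point_delay):
 * suppose T was last released at time 1000, has executed for 30 time units,
 * and opt_time is 1100. With a weight of 0.25, ideal and actual allocation
 * are equal at 1000 + 30 / 0.25 = 1120, so the returned delay is
 * 1120 - 1100 = 20 time units.
 */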
static noinline jiffie_t inc_equal_point_delay(struct task_struct* t)
{
        if (_lt(FP(0), t->rt_param.opt_nw))
                /* when t was released plus time needed to equalize
                 * minus now
                 */
                return get_last_release(t) +
                        _round(_div(FP(t->rt_param.times.exec_time),
                                    t->rt_param.opt_nw)) -
                        opt_time;
        else
                /* if the weight is zero we just take the
                 * deadline
                 */
                return t->rt_param.times.deadline;
}

static noinline jiffie_t decrease_delay(struct task_struct* t)
{
        if (has_active_job(t) && !is_under_allocated(t))
                return dec_equal_point_delay(t);
        return 0;
}

/******************************************************************************/
/*                               SORT ORDERS                                  */
/******************************************************************************/

static int by_linear_metric(struct list_head* a, struct list_head* b)
{
        struct task_struct *ta, *tb;
        ta = list_entry(a, struct task_struct, rt_param.opt_list);
        tb = list_entry(b, struct task_struct, rt_param.opt_list);
        return _gt(ta->rt_param.opt_order, tb->rt_param.opt_order);
}

static int by_delta_weight(struct list_head* a, struct list_head* b)
{
        struct task_struct *ta, *tb;
        ta = list_entry(a, struct task_struct, rt_param.opt_list);
        tb = list_entry(b, struct task_struct, rt_param.opt_list);
        return _lt(ta->rt_param.opt_dw, tb->rt_param.opt_dw);
}

static int by_enactment_time(struct list_head* a, struct list_head* b)
{
        struct task_struct *ta, *tb;
        ta = list_entry(a, struct task_struct, rt_param.opt_list);
        tb = list_entry(b, struct task_struct, rt_param.opt_list);
        return ta->rt_param.opt_change < tb->rt_param.opt_change;
}

/******************************************************************************/
/*                          WEIGHT CHANGE MECHANICS                           */
/******************************************************************************/

static void set_service_level(struct task_struct* t, unsigned int level)
{
        service_level_t *new;
        unsigned int old;

        BUG_ON(!t);
        BUG_ON(t->rt_param.no_service_levels <= level);

        old = t->rt_param.cur_service_level;
        t->rt_param.cur_service_level = level;
        new = t->rt_param.service_level + level;
        t->rt_param.basic_params.period    = new->period;
        t->rt_param.basic_params.exec_cost = _round(_mul(new->weight,
                                                         FP(new->period)));

        scheduler_signal(t, SIGUSR1);

        sched_trace_service_level_change(t, old, level);
        OPT_DBG_T(t, "service level %u activated\n", level);
}

/* call this _before_ updating deadline and release of t */
static void update_weight_estimate(struct task_struct* t)
{
        fp_t nw, ow;
        jiffie_t sl_period, exec_time;

        ow        = get_est_weight(t);
        nw        = t->rt_param.opt_nw;
        exec_time = t->rt_param.times.exec_time;
        sl_period = get_sl(t, get_opt_sl(t)).period;

        OPT_DBG("ow=" _FP_ " nw=" _FP_ ", r-d " _FP_ ", deadline %d, "
                "release %d, exec_time=%ld sl_period=%lu\n",
                fp2str(ow), fp2str(nw),
                fp2str(FP(get_deadline(t) - get_last_release(t))),
                get_deadline(t), get_last_release(t), exec_time, sl_period);

        total_weight = _sub(total_weight, get_est_weight(t));
        t->rt_param.predictor_state.estimate = nw;
        OPT_DBG_T(t, "update_weight_estimate from " _FP_ " to " _FP_ "\n",
                  fp2str(ow), fp2str(nw));
        total_weight = _add(total_weight, get_est_weight(t));

        OPT_DBG_T(t, " update_weight_estimate: " _FP_ " => " _FP_ "\n",
                  fp2str(ow), fp2str(get_est_weight(t)));
}
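/* How the reweighting below is enacted:
 *  - decrease_weight(): the delay has already been computed by the optimizer
 *    (decrease_delay()); an over-allocated task waits until its actual and
 *    ideal allocations are equal, otherwise the change takes effect now.
 *  - increase_weight(): enacted immediately if the task is under-allocated or
 *    if re-releasing it now with the reweighted period still yields an
 *    earlier deadline; otherwise it waits for its equal point, or sits on
 *    adaptive_inc_list until the optimizer has found spare capacity.
 *
 * Example for reweighted_period(): with ow = 0.5, nw = 0.25, alloc = 10, and
 * deadline - release = 100, the remaining budget 100 * 0.5 - 10 = 40 is
 * stretched to 40 / 0.25 = 160 time units under the new weight.
 */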
static void decrease_weight(struct task_struct* t)
{
        fp_t ow, nw;
        jiffie_t last, period, delay;

        ow   = get_sl(t, get_cur_sl(t)).weight;
        nw   = get_sl(t, get_opt_sl(t)).weight;
        last = t->rt_param.times.last_release;
        period = reweighted_period(ow, nw, t->rt_param.times.exec_time,
                                   t->rt_param.times.deadline, last);

        /* necessary delay has already been computed by the optimizer */
        delay = t->rt_param.opt_change;

        update_weight_estimate(t);

        if (!delay)
                t->rt_param.times.last_release = opt_time;
        t->rt_param.times.release  = opt_time + delay;
        t->rt_param.times.deadline = opt_time + delay + period;

        set_service_level(t, get_opt_sl(t));

        /* take out of queue/link structure */
        unlink(t);
        /* present as a new job */
        adaptive_job_arrival(t);
}

static void increase_weight(struct task_struct* t)
{
        fp_t ow, nw;
        jiffie_t last, period, delay;

        ow   = get_sl(t, get_cur_sl(t)).weight;
        nw   = get_sl(t, get_opt_sl(t)).weight;
        last = t->rt_param.times.last_release;
        period = reweighted_period(ow, nw, t->rt_param.times.exec_time,
                                   t->rt_param.times.deadline, last);

        if (t->rt_param.opt_change == 0) {
                /* can be enacted now */
                if (is_under_allocated(t) ||
                    time_before(opt_time + period, get_deadline(t)))
                        /* do it now */
                        delay = 0;
                else {
                        if (is_under_allocated(t)) {
                                t->rt_param.opt_change += opt_time;
                                /* The next job release will notice that
                                 * opt != sl and initiate a weight change.
                                 */
                                return;
                        } else
                                /* nope, wait for equal point */
                                delay = inc_equal_point_delay(t);
                }

                update_weight_estimate(t);

                if (!delay)
                        t->rt_param.times.last_release = opt_time;
                t->rt_param.times.release  = opt_time + delay;
                t->rt_param.times.deadline = opt_time + delay + period;

                set_service_level(t, get_opt_sl(t));

                /* take out of queue/link structure */
                unlink(t);
                /* present as a new job */
                adaptive_job_arrival(t);
        } else {
                /* must wait until capacity is released */
                t->rt_param.opt_change += opt_time;
                list_insert(&t->rt_param.opt_list, &adaptive_inc_list,
                            by_enactment_time);
        }
}

static void delayed_increase_weight(void)
{
        struct list_head *p, *extra;
        struct task_struct* t;

        opt_time = jiffies;
        list_for_each_safe(p, extra, &adaptive_inc_list) {
                t = list_entry(p, struct task_struct, rt_param.opt_list);
                if (time_before_eq(t->rt_param.opt_change, opt_time)) {
                        list_del(p);
                        /* prevent recursion */
                        t->rt_param.opt_change = 0;
                        /* this takes care of everything */
                        increase_weight(t);
                } else
                        /* list is sorted */
                        break;
        }
}

static void change_weight(struct task_struct* t)
{
        if (get_cur_sl(t) < get_opt_sl(t))
                increase_weight(t);
        else
                decrease_weight(t);
        OPT_DBG_T(t, "after change_weight: last_rel:%d rel:%d dl:%d\n",
                  get_last_release(t),
                  get_release(t),
                  get_deadline(t));
}

/******************************************************************************/
/*                                OPTIMIZER                                   */
/******************************************************************************/

/* only invoke with adaptive_lock being held */
void adaptive_optimize(void)
{
        struct list_head list;
        struct list_head inc, dec;
        struct list_head *p, *extra;
        cpu_entry_t *cpu;
        struct task_struct* t;
        fp_t M = FP(0), w0, wl, tmp, estU = FP(0);
        unsigned int l;
        jiffie_t enactment_time;

        if (time_before(jiffies,
                        last_optimizer_run + optimizer_min_invocation_sep))
                return;

        OPT_DBG(":::::: running adaptive optimizer\n");

        opt_time = jiffies;

        INIT_LIST_HEAD(&list);

        /* 1) gather all tasks */
        list_for_each(p, &adaptive.ready_queue)
                list_add(&(rt_list2task(p)->rt_param.opt_list), &list);
        list_for_each(p, &adaptive.release_queue)
                list_add(&(rt_list2task(p)->rt_param.opt_list), &list);
        list_for_each(p, &adaptive_cpu_queue) {
                cpu = list_entry(p, cpu_entry_t, list);
                if (cpu->linked)
                        list_add(&cpu->linked->rt_param.opt_list, &list);
        }

        /* 2) determine current system capacity */
        M = system_capacity;
        OPT_DBG("opt: system capacity: " _FP_ "\n", fp2str(M));
        /* 3) Compute L value for all tasks,
         *    and set tasks to service level 0,
         *    also account for weight.
         *    Also establish current estimated utilization.
         */
        list_for_each_safe(p, extra, &list) {
                t = list_entry(p, struct task_struct, rt_param.opt_list);
                if (time_before(opt_time, get_last_release(t))) {
                        list_del(p);
                        continue;
                }
                t->rt_param.opt_order = linear_metric(t);
                OPT_DBG_T(t, "est_w = " _FP_ " L = " _FP_ "\n",
                          get_est_weight(t),
                          fp2str(t->rt_param.opt_order));
                t->rt_param.opt_level = 0;
                M = _sub(M, est_weight_at(t, 0));
                estU = _add(estU, get_est_weight(t));
        }
        OPT_DBG("opt: estimated utilization: " _FP_ "\n", fp2str(estU));
        OPT_DBG("opt: estimated capacity at all sl=0: " _FP_ "\n", fp2str(M));

        /* 4) sort list by decreasing linear metric */
        list_qsort(&list, by_linear_metric);

        /* 5) assign each task a service level */
        list_for_each(p, &list) {
                t = list_entry(p, struct task_struct, rt_param.opt_list);
                l = t->rt_param.no_service_levels;
                w0 = est_weight_at(t, 0);
                while (l > 1) {
                        l--;
                        wl = est_weight_at(t, l);
                        tmp = _sub(M, _sub(wl, w0));
                        if (_leq(FP(0), tmp)) {
                                /* this level fits in */
                                M = tmp;
                                t->rt_param.opt_level = l;
                                t->rt_param.opt_dw = _sub(wl,
                                                          get_est_weight(t));
                                t->rt_param.opt_nw = wl;
                                break; /* proceed to next task */
                        }
                }
                OPT_DBG_T(t, " will run at sl=%u, prior=%u dw=" _FP_ "\n",
                          l, get_cur_sl(t), fp2str(t->rt_param.opt_dw));
        }

        /* 6) filter tasks that reweight */
        INIT_LIST_HEAD(&inc);
        INIT_LIST_HEAD(&dec);
        list_for_each_safe(p, extra, &list) {
                t = list_entry(p, struct task_struct, rt_param.opt_list);
                list_del(p);
                if (t->rt_param.opt_level < get_cur_sl(t)) {
                        list_add(p, &dec);
                        t->rt_param.opt_change = decrease_delay(t);
                } else if (t->rt_param.opt_level > get_cur_sl(t)) {
                        list_add(p, &inc);
                        t->rt_param.opt_change = 0;
                }
                /* if t doesn't change we can ignore it from now on */
        }

        /* 7) sort dec and inc list */
        list_qsort(&dec, by_enactment_time);
        list_qsort(&inc, by_delta_weight);

        /* 8) Now figure out when we can enact weight increases.
         * It works like this: We know the current system utilization.
         * Thus, we know the remaining capacity. We also know when
         * decreases are going to be enacted (=> capacity increases).
         * Now we only need to find a spot where the weight increase will
         * not drive the system into overload.
         */

        /* Very ugly jump, but we need to force enactment_time = 0
         * during the first iteration.
         */
        M = system_capacity;
        enactment_time = 0;
        goto first_iteration;

        while (!list_empty(&inc)) {
                enactment_time = list_entry(dec.next, struct task_struct,
                                            rt_param.opt_list)
                        ->rt_param.opt_change;
        first_iteration:
                /* Start by collapsing the next decrease.
                 * Except in the first iteration, it will always
                 * pick off at least one task.
                 */
                list_for_each_safe(p, extra, &dec) {
                        t = list_entry(p, struct task_struct,
                                       rt_param.opt_list);
                        if (t->rt_param.opt_change == enactment_time) {
                                list_del(p);
                                /* opt_dw is negative */
                                estU = _add(estU, t->rt_param.opt_dw);
                                list_add(p, &list);

                                OPT_DBG_T(t, " weight decrease at %ld => estU="
                                          _FP_ "\n", enactment_time,
                                          fp2str(estU));

                        } else
                                /* stop decrease loop */
                                break;
                }

                /* now start setting enactment times for increases */
                while (!list_empty(&inc)) {
                        p = inc.next;
                        t = list_entry(p, struct task_struct,
                                       rt_param.opt_list);
                        tmp = _add(estU, t->rt_param.opt_dw);
                        if (_leq(tmp, M)) {
                                /* it fits */
                                estU = tmp;
                                t->rt_param.opt_change = enactment_time;
                                list_del(p);
                                list_add(p, &list);

                                OPT_DBG_T(t, " weight increase at %ld => estU="
                                          _FP_ "\n", enactment_time,
                                          fp2str(estU));

                        } else
                                /* stop increase loop */
                                break;
                }

                TRACE_BUG_ON(list_empty(&dec) && !list_empty(&inc));
                if (list_empty(&dec) && !list_empty(&inc))
                        /* break out in case of bug */
                        break;
        }
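        /* Example of the enactment loop above: with M = 4.0 and estU = 3.9,
         * a pending increase of dw = +0.3 does not fit at time 0. If a
         * decrease of dw = -0.4 is enacted at time t, estU drops to 3.5 and
         * the increase can be scheduled for the same enactment time t,
         * bringing estU to 3.8 <= M.
         */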
        /* 9) Wow. We made it. Every task now has a new service level
         *    assigned, together with a correct (earliest) enactment time.
         *    All we have left to do now is to enact changes that did not get
         *    delayed. Also convert change fields to actual timestamps to be
         *    nice to scheduler_tick().
         */
        INIT_LIST_HEAD(&adaptive_inc_list);
        list_for_each_safe(p, extra, &list) {
                t = list_entry(p, struct task_struct, rt_param.opt_list);
                list_del(p);
                change_weight(t);
        }

        last_optimizer_run = jiffies;
        OPT_DBG(":::::: optimizer run complete\n");
}

/* update_cpu_position - Move the cpu entry to the correct place to maintain
 *                       order in the cpu queue. Caller must hold adaptive
 *                       lock.
 */
static void update_cpu_position(cpu_entry_t *entry)
{
        cpu_entry_t *other;
        struct list_head *pos;
        list_del(&entry->list);
        /* if we do not execute real-time jobs we just move
         * to the end of the queue
         */
        if (entry->linked) {
                list_for_each(pos, &adaptive_cpu_queue) {
                        other = list_entry(pos, cpu_entry_t, list);
                        if (edf_higher_prio(entry->linked, other->linked)) {
                                __list_add(&entry->list, pos->prev, pos);
                                return;
                        }
                }
        }
        /* if we get this far we have the lowest priority job */
        list_add_tail(&entry->list, &adaptive_cpu_queue);
}

/* link_task_to_cpu - Update the link of a CPU.
 *                    Handles the case where the to-be-linked task is already
 *                    scheduled on a different CPU.
 */
static noinline void link_task_to_cpu(struct task_struct* linked,
                                      cpu_entry_t *entry)
{
        cpu_entry_t *sched;
        struct task_struct* tmp;
        int on_cpu;

        BUG_ON(linked && !is_realtime(linked));

        /* Currently linked task is set to be unlinked. */
        if (entry->linked)
                entry->linked->rt_param.linked_on = NO_CPU;

        /* Link new task to CPU. */
        if (linked) {
                set_rt_flags(linked, RT_F_RUNNING);
                /* handle the case that the task is already scheduled
                 * somewhere else!
                 */
                on_cpu = linked->rt_param.scheduled_on;
                if (on_cpu != NO_CPU) {
                        sched = &per_cpu(adaptive_cpu_entries, on_cpu);
                        /* this should only happen if not linked already */
                        BUG_ON(sched->linked == linked);

                        /* If we are already scheduled on the CPU to which we
                         * wanted to link, we don't need to do the swap --
                         * we just link ourselves to the CPU and depend on
                         * the caller to get things right.
                         */
                        if (entry != sched) {
                                tmp = sched->linked;
                                linked->rt_param.linked_on = sched->cpu;
                                sched->linked = linked;
                                update_cpu_position(sched);
                                linked = tmp;
                        }
                }
                if (linked) /* might be NULL due to swap */
                        linked->rt_param.linked_on = entry->cpu;
        }
        entry->linked = linked;
        update_cpu_position(entry);
}

/* unlink - Make sure a task is not linked any longer to an entry
 *          where it was linked before. Must hold adaptive_lock.
 */
static void unlink(struct task_struct* t)
{
        cpu_entry_t *entry;

        if (unlikely(!t)) {
                TRACE_BUG_ON(!t);
                return;
        }

        if (t->rt_param.linked_on != NO_CPU) {
                /* unlink */
                entry = &per_cpu(adaptive_cpu_entries, t->rt_param.linked_on);
                t->rt_param.linked_on = NO_CPU;
                link_task_to_cpu(NULL, entry);
        } else if (in_list(&t->rt_list)) {
                /* This is an interesting situation: t is scheduled,
                 * but was just recently unlinked. It cannot be
                 * linked anywhere else (because then it would have
                 * been relinked to this CPU), thus it must be in some
                 * queue. We must remove it from the list in this
                 * case.
                 */
                list_del(&t->rt_list);
        }
}
/* preempt - force a CPU to reschedule
 */
static noinline void preempt(cpu_entry_t *entry)
{
        /* We cannot make the is_np() decision here if it is a remote CPU
         * because requesting exit_np() requires that we currently use the
         * address space of the task. Thus, in the remote case we just send
         * the IPI and let schedule() handle the problem.
         */
        if (smp_processor_id() == entry->cpu) {
                if (entry->scheduled && is_np(entry->scheduled))
                        request_exit_np(entry->scheduled);
                else
                        set_tsk_need_resched(current);
        } else
                /* in case that it is a remote CPU we have to defer the
                 * decision to the remote CPU
                 * FIXME: We could save a few IPIs here if we leave the flag
                 *        set when we are waiting for a np_exit().
                 */
                if (!test_will_schedule(entry->cpu))
                        smp_send_reschedule(entry->cpu);
}

/* requeue - Put an unlinked task into the adaptive domain.
 *           Caller must hold adaptive_lock.
 */
static noinline void requeue(struct task_struct* task)
{
        BUG_ON(!task);
        /* sanity check rt_list before insertion */
        BUG_ON(in_list(&task->rt_list));

        if (get_rt_flags(task) == RT_F_SLEEP ||
            get_rt_mode() != MODE_RT_RUN) {
                /* this task has expired
                 * _schedule has already taken care of updating
                 * the release and deadline. We just need to check
                 * whether it has been released.
                 */
                if (is_released(task) && get_rt_mode() == MODE_RT_RUN)
                        __add_ready(&adaptive, task);
                else {
                        /* it has got to wait */
                        __add_release(&adaptive, task);
                }
        } else
                /* this is a forced preemption
                 * thus the task stays in the ready_queue
                 * we only must make it available to others
                 */
                __add_ready(&adaptive, task);
}

/* adaptive_job_arrival: task is either resumed or released */
static void adaptive_job_arrival(struct task_struct* task)
{
        cpu_entry_t* last;

        BUG_ON(list_empty(&adaptive_cpu_queue));
        BUG_ON(!task);

        TRACE_TASK(task, "job_arrival: last_rel=%d rel=%d dl=%d now=%d\n",
                   get_last_release(task), get_release(task),
                   get_deadline(task), jiffies);

        /* first queue arriving job */
        requeue(task);

        /* then check for any necessary preemptions */
        last = list_entry(adaptive_cpu_queue.prev, cpu_entry_t, list);
        if (edf_preemption_needed(&adaptive, last->linked)) {
                /* preemption necessary */
                task = __take_ready(&adaptive);
                TRACE("job_arrival: task %d linked to %d\n",
                      task->pid, last->cpu);
                if (last->linked)
                        requeue(last->linked);

                link_task_to_cpu(task, last);
                preempt(last);
        }
}

/* check for current job releases */
static noinline void adaptive_release_jobs(void)
{
        struct list_head *pos, *save;
        struct task_struct *queued;

        list_for_each_safe(pos, save, &adaptive.release_queue) {
                queued = list_entry(pos, struct task_struct, rt_list);
                if (likely(is_released(queued))) {
                        TRACE_TASK(queued, "released rel=%d now=%d\n",
                                   get_release(queued), jiffies);
                        /* this one is ready to go */
                        list_del(pos);
                        set_rt_flags(queued, RT_F_RUNNING);
                        queued->rt_param.times.last_release =
                                queued->rt_param.times.release;

                        /* check for delayed weight increase */
                        if (get_opt_sl(queued) != get_cur_sl(queued) &&
                            time_before_eq(queued->rt_param.opt_change,
                                           jiffies)) {
                                opt_time = jiffies;
                                set_service_level(queued, get_opt_sl(queued));
                                queued->rt_param.times.deadline =
                                        get_last_release(queued) +
                                        get_rt_period(queued);
                                total_weight = _sub(total_weight,
                                                    get_est_weight(queued));
                                queued->rt_param.predictor_state.estimate =
                                        queued->rt_param.opt_nw;
                                total_weight = _add(total_weight,
                                                    get_est_weight(queued));
                        }

                        sched_trace_job_release(queued);
                        adaptive_job_arrival(queued);
                } else
                        /* the release queue is ordered */
                        break;
        }
}

/* adaptive_scheduler_tick - this function is called for every local timer
 *                           interrupt.
 *
 *                           It checks whether the current task has expired
 *                           and checks whether we need to preempt it if it
 *                           has not expired.
 */
static reschedule_check_t adaptive_scheduler_tick(void)
{
        unsigned long flags;
        struct task_struct* t = current;
        reschedule_check_t want_resched = NO_RESCHED;

        /* Account for exec time.
         * Since we don't preempt forcefully, nothing else needs to be done.
         */
        if (is_realtime(t))
                t->rt_param.times.exec_time++;

        /* only the first CPU needs to release jobs */
        if (get_rt_mode() == MODE_RT_RUN) {
                queue_lock_irqsave(&adaptive_lock, flags);

                /* (1) run the optimizer if it did not trigger often enough */
                if (time_before_eq(last_optimizer_run + optimizer_period,
                                   jiffies)) {
                        OPT_DBG("adaptive: optimizing due to period "
                                "threshold\n");
                        adaptive_optimize();
                }

                /* (2) enact delayed weight increases */
                delayed_increase_weight();

                /* (3) try to release pending jobs */
                adaptive_release_jobs();

                /* we don't need to check linked != scheduled since
                 * set_tsk_need_resched has been set by preempt() if necessary
                 */

                queue_unlock_irqrestore(&adaptive_lock, flags);
        }

        return want_resched;
}

/* caller holds adaptive_lock */
static noinline void job_completion(struct task_struct *t)
{
        long delta;
        fp_t actual_weight, old_estimate;
        unsigned int lcurr = get_cur_sl(t);
        fp_t v = t->rt_param.service_level[lcurr].value;

        int non_zero_weight;
        fp_t error_percentage;
        int exceeds_threshold;

        BUG_ON(!t);

        TRACE_TASK(t, " completion, last_rel=%d rel=%d dl=%d now=%d "
                   "period=%d\n",
                   get_last_release(t), get_release(t), get_deadline(t),
                   jiffies, get_rt_period(t));

        sched_trace_job_completion(t);
        delta = t->rt_param.times.exec_time -
                t->rt_param.basic_params.exec_cost;

        OPT_DBG_T(t, "job %d completes, delta WCET = %d\n",
                  t->rt_param.times.job_no, delta);

        actual_weight = _frac(t->rt_param.times.exec_time,
                              t->rt_param.basic_params.period);
        sched_trace_weight_error(t, actual_weight);
        old_estimate = get_est_weight(t);
        update_estimate(&t->rt_param.predictor_state, actual_weight,
                        fc_a, fc_b);

        OPT_DBG_T(t, "Job %d completes. Current value " _FP_
                  ", Weight estimation: error=" _FP_ " weight="
                  _FP_ " => " _FP_ "\n", t->rt_param.times.job_no, v,
                  _sub(get_est_weight(t), old_estimate),
                  old_estimate, get_est_weight(t));

        /* Now we have determined the task error.
         * Next we release the next job.
         * Then we optimize. It's easier for the optimizer to deal
         * with just-released jobs.
         */

        /* prepare for next period */
        edf_prepare_for_next_period(t);

        TRACE_TASK(t, " prepped, last_rel=%d rel=%d dl=%d now=%d\n",
                   get_last_release(t), get_release(t),
                   get_deadline(t), jiffies);

        if (is_released(t)) {
                /* set flags */
                /* prevent fake completions */
                set_rt_flags(t, RT_F_RUNNING);
                t->rt_param.times.last_release =
                        t->rt_param.times.release;
        }

        non_zero_weight = !_eq(get_est_weight(t), FP(0));
        if (non_zero_weight)
                error_percentage = _div(_abs(_sub(get_est_weight(t),
                                                  old_estimate)),
                                        get_est_weight(t));
        else
                error_percentage = FP(0);
        exceeds_threshold = _gt(error_percentage, task_error_threshold);

        if (exceeds_threshold) {
                OPT_DBG("adaptive: optimizing due to task error threshold\n");
                adaptive_optimize();
        } else if (_gt(total_weight, system_capacity)) {
                OPT_DBG("adaptive: optimizing due to system capacity "
                        "exceeded\n");
                adaptive_optimize();
        }

        /* unlink */
        unlink(t);
        /* requeue
         * But don't requeue a blocking task.
         */
        if (is_running(t))
                adaptive_job_arrival(t);
}
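/* Example for the error-threshold check in job_completion() above: with
 * task_error_threshold = 1/2, a job whose weight estimate moves from 0.20 to
 * 0.32 has a relative error of |0.32 - 0.20| / 0.32 = 0.375 and does not
 * trigger the optimizer; a jump from 0.20 to 0.50 (relative error 0.6) does.
 */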
/* Getting schedule() right is a bit tricky. schedule() may not make any
 * assumptions on the state of the current task since it may be called for a
 * number of reasons. The reasons include: a scheduler_tick() determined that
 * it was necessary, sys_exit_np() was called, some Linux subsystem determined
 * so, or even (in the worst case) there is a bug hidden somewhere. Thus, we
 * must take extreme care to determine what the current state is.
 *
 * The CPU could currently be scheduling a task (or not), be linked (or not).
 *
 * The following assertions for the scheduled task could hold:
 *
 *      - !is_running(scheduled)        // the job blocks
 *      - get_rt_flag() == RT_F_SLEEP   // the job completed (by syscall)
 *      - linked != scheduled           // we need to reschedule (for any
 *                                         reason)
 *
 * Any of these can occur together.
 */
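/* In summary, adaptive_schedule() proceeds as follows: a blocking job is
 * unlinked; a sleeping (completed) job goes through job_completion(); when
 * real-time mode is off, a linked job is unlinked and requeued; when
 * real-time mode is on and the CPU is unlinked, the next ready job is linked;
 * finally, if linked != scheduled, the previously scheduled job is taken off
 * the Linux runqueue and the linked job is activated in its place.
 */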
static int adaptive_schedule(struct task_struct * prev,
                             struct task_struct ** next,
                             runqueue_t * rq)
{
        cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries);
        int sleep, preempt, exists, rt, blocks;
        struct task_struct* linked;

        /* Will be released in finish_switch. */
        queue_lock(&adaptive_lock);
        clear_will_schedule();

        /* sanity checking */
        BUG_ON(entry->scheduled && entry->scheduled != prev);
        BUG_ON(entry->scheduled && !is_realtime(prev));

        /* (0) Determine state */
        exists  = entry->scheduled != NULL;
        blocks  = exists && !is_running(entry->scheduled);
        sleep   = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
        preempt = entry->scheduled != entry->linked;
        rt      = get_rt_mode() == MODE_RT_RUN;

        /* If a task blocks we have no choice but to reschedule. */
        if (blocks)
                unlink(entry->scheduled);

        /* Task wants to sleep -> job is done. */
        if (sleep)
                job_completion(entry->scheduled);

        /* Stop real-time tasks when we leave real-time mode */
        if (!rt && entry->linked) {
                /* task will be preempted once it is preemptable
                 * (which it may be already)
                 */
                linked = entry->linked;
                unlink(linked);
                requeue(linked);
        }

        /* Link pending task if we became unlinked. */
        if (rt && !entry->linked)
                link_task_to_cpu(__take_ready(&adaptive), entry);

        /* The final scheduling decision. Do we need to switch for some
         * reason? If linked differs from scheduled, select linked as next.
         */
        if (entry->linked != entry->scheduled) {
                /* Take care of a previously scheduled
                 * job by taking it out of the Linux runqueue.
                 */
                if (entry->scheduled)
                        if (prev->array)
                                /* take it out of the run queue */
                                deactivate_task(prev, rq);

                /* Schedule a linked job? */
                if (entry->linked) {
                        *next = entry->linked;
                        /* mark the task as executing on this cpu */
                        set_task_cpu(*next, smp_processor_id());
                        /* stick the task into the runqueue */
                        __activate_task(*next, rq);
                }
        } else
                /* Only override the Linux scheduler if we have a real-time
                 * task scheduled that needs to continue.
                 */
                if (exists)
                        *next = prev;

        /* Unlock in case that we don't affect real-time tasks or
         * if nothing changed and finish_switch won't be called.
         */
        if (prev == *next || (!is_realtime(prev) && !*next))
                queue_unlock(&adaptive_lock);

        return 0;
}

/* _finish_switch - we just finished the switch away from prev */
static void adaptive_finish_switch(struct task_struct *prev)
{
        cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries);

        if (is_realtime(current))
                entry->scheduled = current;
        else
                entry->scheduled = NULL;

        prev->rt_param.scheduled_on = NO_CPU;
        current->rt_param.scheduled_on = smp_processor_id();

        /* unlock in case schedule() left it locked */
        if (is_realtime(current) || is_realtime(prev))
                queue_unlock(&adaptive_lock);
}

/* Prepare a task for running in RT mode.
 * Enqueues the task into the master queue data structure.
 * Returns
 *      -EPERM if the task is not TASK_STOPPED
 */
static long adaptive_prepare_task(struct task_struct * t)
{
        unsigned long flags;

        TRACE("adaptive: prepare task %d\n", t->pid);

        if (t->state == TASK_STOPPED) {
                __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);

                t->rt_param.scheduled_on = NO_CPU;
                t->rt_param.linked_on    = NO_CPU;
                if (t->rt_param.no_service_levels) {
                        t->rt_param.predictor_state.estimate =
                                get_sl(t, 0).weight;
                } else
                        t->rt_param.predictor_state.estimate =
                                _frac(get_exec_cost(t), get_rt_period(t));

                TRACE_TASK(t, "est_weight=" _FP_ "\n", get_est_weight(t));

                if (get_rt_mode() == MODE_RT_RUN)
                        /* The action is already on.
                         * Prepare immediate release.
                         */
                        edf_release_now(t);
                /* The task should be running in the queue, otherwise signal
                 * code will try to wake it up with fatal consequences.
                 */
                t->state = TASK_RUNNING;

                queue_lock_irqsave(&adaptive_lock, flags);
                total_weight = _add(total_weight, get_est_weight(t));
                requeue(t);
                queue_unlock_irqrestore(&adaptive_lock, flags);
                return 0;
        } else
                return -EPERM;
}

static void adaptive_wake_up_task(struct task_struct *task)
{
        unsigned long flags;

        /* We must determine whether the task should go into the release
         * queue or into the ready queue. It may enter the ready queue
         * if it has credit left in its time slice and has not yet reached
         * its deadline. If it is now past its deadline we assume this is
         * the arrival of a new sporadic job and thus put it in the ready
         * queue anyway. If it has zero budget and the next release is in
         * the future it has to go to the release queue.
         */
        TRACE("adaptive: %d unsuspends\n", task->pid);

        task->state = TASK_RUNNING;

        if (is_tardy(task)) {
                /* new sporadic release */
                edf_release_now(task);
                sched_trace_job_release(task);
        } else if (task->time_slice)
                /* came back in time before deadline */
                set_rt_flags(task, RT_F_RUNNING);

        queue_lock_irqsave(&adaptive_lock, flags);
        total_weight = _add(total_weight, get_est_weight(task));
        adaptive_job_arrival(task);
        queue_unlock_irqrestore(&adaptive_lock, flags);
}

static void adaptive_task_blocks(struct task_struct *t)
{
        unsigned long flags;

        /* unlink if necessary */
        queue_lock_irqsave(&adaptive_lock, flags);
        total_weight = _sub(total_weight, get_est_weight(t));
        unlink(t);
        queue_unlock_irqrestore(&adaptive_lock, flags);

        BUG_ON(!is_realtime(t));
        TRACE("task %d suspends\n", t->pid);

        BUG_ON(t->rt_list.next != LIST_POISON1);
        BUG_ON(t->rt_list.prev != LIST_POISON2);
}

/* When _tear_down is called, the task should not be in any queue any more
 * as it must have blocked first. We don't have any internal state for the
 * task, it is all in the task_struct.
 */
static long adaptive_tear_down(struct task_struct * t)
{
        BUG_ON(!is_realtime(t));
        TRACE_TASK(t, "RIP\n");
        BUG_ON(t->array);
        BUG_ON(t->rt_list.next != LIST_POISON1);
        BUG_ON(t->rt_list.prev != LIST_POISON2);
        return 0;
}

static int adaptive_mode_change(int new_mode)
{
        unsigned long flags;
        int cpu;
        cpu_entry_t *entry;
        struct task_struct* t;
        struct list_head* pos;

        if (new_mode == MODE_RT_RUN) {
                queue_lock_irqsave(&adaptive_lock, flags);

                system_capacity = FP(0);
                for_each_online_cpu(cpu)
                        system_capacity = _add(system_capacity, FP(1));

                __rerelease_all(&adaptive, edf_release_at);

                total_weight = FP(0);
                list_for_each(pos, &adaptive.release_queue) {
                        t = list_entry(pos, struct task_struct, rt_list);
                        total_weight = _add(total_weight, get_est_weight(t));
                }
                TRACE("adaptive: total weight: " _FP_
                      " (at mode change)\n", total_weight);

                /* get old cruft out of the way in case we reenter real-time
                 * mode for a second time
                 */
                while (!list_empty(&adaptive_cpu_queue))
                        list_del(adaptive_cpu_queue.next);
                /* reinitialize */
                for_each_online_cpu(cpu) {
                        entry = &per_cpu(adaptive_cpu_entries, cpu);
                        atomic_set(&entry->will_schedule, 0);
                        entry->linked    = NULL;
                        entry->scheduled = NULL;
                        list_add(&entry->list, &adaptive_cpu_queue);
                }

                adaptive_optimize();

                queue_unlock_irqrestore(&adaptive_lock, flags);
        }
        return 0;
}

typedef enum {
        ADAPTIVE_SET_MIN_OPT_SEP = 1
} adaptive_cmds_t;

static int adaptive_setup(int cmd, void __user *up)
{
        int error = -EINVAL;
        unsigned int val;

        if (copy_from_user(&val, up, sizeof(unsigned int))) {
                error = -EFAULT;
                goto out;
        }

        switch (cmd) {
        case ADAPTIVE_SET_MIN_OPT_SEP:
                optimizer_min_invocation_sep = val;
                TRACE("adaptive: min opt sep set to %d\n",
                      optimizer_min_invocation_sep);
                return 0;
        }

out:
        return error;
}

/* Plugin object */
static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
        .ready_to_use = 0
};

/*
 * Plugin initialization code.
 */
#define INIT_SCHED_PLUGIN (struct sched_plugin) {               \
        .plugin_name            = "ADAPTIVE",                   \
        .ready_to_use           = 1,                            \
        .scheduler_tick         = adaptive_scheduler_tick,      \
        .prepare_task           = adaptive_prepare_task,        \
        .sleep_next_period      = edf_sleep_next_period,        \
        .tear_down              = adaptive_tear_down,           \
        .schedule               = adaptive_schedule,            \
        .finish_switch          = adaptive_finish_switch,       \
        .mode_change            = adaptive_mode_change,         \
        .wake_up_task           = adaptive_wake_up_task,        \
        .task_blocks            = adaptive_task_blocks,         \
        .scheduler_setup        = adaptive_setup                \
}

sched_plugin_t *__init init_adaptive_plugin(void)
{
        int cpu;
        cpu_entry_t *entry;

        /* magic values given in the paper */
        fc_a = _frac(102, 1000);
        fc_b = _frac(303, 1000);

        optimizer_period             = 1000;
        optimizer_min_invocation_sep = 50;
        task_error_threshold         = _frac(1, 2);

        if (!s_plugin.ready_to_use) {
                /* initialize CPU state */
                for (cpu = 0; cpu < NR_CPUS; cpu++) {
                        entry = &per_cpu(adaptive_cpu_entries, cpu);
                        atomic_set(&entry->will_schedule, 0);
                        entry->linked    = NULL;
                        entry->scheduled = NULL;
                        entry->cpu       = cpu;
                }

                queue_lock_init(&adaptive_lock);
                edf_domain_init(&adaptive, NULL);
                s_plugin = INIT_SCHED_PLUGIN;
        }
        return &s_plugin;
}