/**
 * litmus/sched_mc_ce.c
 *
 * The Cyclic Executive (CE) scheduler used by the mixed criticality scheduling
 * algorithm.
 */

#include <asm/atomic.h>
#include <asm/uaccess.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/pid.h>
#include <linux/sched.h>

#include <litmus/litmus.h>
#include <litmus/sched_plugin.h>
#include <litmus/rt_domain.h>
#include <litmus/rt_param.h>
#include <litmus/litmus_proc.h>
#include <litmus/sched_trace.h>
#include <litmus/jobs.h>
#include <litmus/sched_mc.h>
#include <litmus/ce_domain.h>

static struct sched_plugin mc_ce_plugin __cacheline_aligned_in_smp;

#define is_active_plugin() (litmus == &mc_ce_plugin)
#define get_ce_data(dom_data_ref) (dom_data_ref->domain.data)

static atomic_t start_time_set = ATOMIC_INIT(-1);
static atomic64_t start_time = ATOMIC64_INIT(0);
static struct proc_dir_entry *mc_ce_dir = NULL, *ce_file = NULL;

DEFINE_PER_CPU(domain_data_t, mc_ce_doms);
DEFINE_PER_CPU(rt_domain_t, mc_ce_rts);
DEFINE_PER_CPU(struct ce_dom_data, _mc_ce_dom_data);

/* Return the address of the domain_t for this CPU, used by the
 * mixed-criticality plugin. */
domain_data_t *ce_domain_for(int cpu)
{
	return &per_cpu(mc_ce_doms, cpu);
}

/*
 * Get the offset into the cycle, taking the start time into account.
 */
static inline lt_t get_cycle_offset(const lt_t when, const lt_t cycle_time)
{
	long long st = atomic64_read(&start_time);
	lt_t offset = (when - st) % cycle_time;
	TRACE("when: %llu cycle_time: %llu start_time: %lld offset %llu\n",
			when, cycle_time, st, offset);
	return offset;
}

/*
 * The user-land job completion call will set the RT_F_SLEEP flag and then
 * call schedule. This function is used when schedule sleeps a task.
 *
 * Do not call prepare_for_next_period on Level-A tasks!
 */
static void mc_ce_job_completion(struct task_struct *ts)
{
	const domain_data_t *dom_data = &per_cpu(mc_ce_doms, smp_processor_id());
	const struct ce_dom_data *ce_data = get_ce_data(dom_data);
	const int idx = tsk_mc_data(ts)->mc_task.lvl_a_id;
	const struct ce_dom_pid_entry *pid_entry = &ce_data->pid_entries[idx];
	int just_finished;

	TRACE_TASK(ts, "completed\n");
	sched_trace_task_completion(ts, 0);

	/* post-increment is important here */
	just_finished = (tsk_rt(ts)->job_params.job_no)++;

	/* Job completes in expected window: everything is normal.
	 * Job completes in an earlier window: BUG(), that's wrong.
	 * Job completes in a later window: the job is running behind.
	 */
	if (just_finished < pid_entry->expected_job) {
		/* this job is already released because it's running behind */
		set_rt_flags(ts, RT_F_RUNNING);
		TRACE_TASK(ts, "appears behind: the expected job is %d but "
				"job %d just completed\n",
				pid_entry->expected_job, just_finished);
	} else if (pid_entry->expected_job < just_finished) {
		printk(KERN_CRIT "job %d completed in expected job %d which "
				"seems too early\n", just_finished,
				pid_entry->expected_job);
		BUG();
	}
}

/*
 * Return the index into the PID entries table of what to schedule next.
 * Don't call if the table is empty. Assumes the caller has the domain lock.
 * The offset parameter is the offset into the cycle.
 *
 * TODO Currently O(n) in the number of tasks on the CPU. Binary search?
 * (See the illustrative sketch below.)
 */
static int mc_ce_schedule_at(const domain_t *dom, lt_t offset)
{
	const struct ce_dom_data *ce_data = dom->data;
	const struct ce_dom_pid_entry *pid_entry = NULL;
	int i;

	BUG_ON(ce_data->cycle_time < 1);
	BUG_ON(ce_data->num_pid_entries < 1);

	for (i = 0; i < ce_data->num_pid_entries; ++i) {
		pid_entry = &ce_data->pid_entries[i];
		if (offset < pid_entry->acc_time) {
			/* found the task to schedule in this window */
			break;
		}
	}
	/* can only happen if cycle_time is not right */
	BUG_ON(pid_entry->acc_time > ce_data->cycle_time);
	TRACE("schedule at returned task %d for CPU %d\n", i, ce_data->cpu);
	return i;
}
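/*
 * A sketch of the binary search suggested by the TODO above. It relies only
 * on the fact that acc_time increases across pid_entries (each entry adds a
 * positive budget), so the first entry with offset < acc_time can be found
 * in O(log n). The function name is hypothetical and nothing in the plugin
 * calls it; it is illustration only.
 */
static __maybe_unused int mc_ce_schedule_at_bsearch(const domain_t *dom,
		lt_t offset)
{
	const struct ce_dom_data *ce_data = dom->data;
	int lo = 0, hi = ce_data->num_pid_entries - 1, mid;

	BUG_ON(ce_data->num_pid_entries < 1);
	while (lo < hi) {
		mid = lo + (hi - lo) / 2;
		if (offset < ce_data->pid_entries[mid].acc_time)
			hi = mid;	/* the window is mid or earlier */
		else
			lo = mid + 1;	/* the window is strictly later */
	}
	return lo;
}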
static struct task_struct *mc_ce_schedule(struct task_struct *prev)
{
	domain_data_t *dom_data = &per_cpu(mc_ce_doms, smp_processor_id());
	domain_t *dom = &dom_data->domain;
	struct ce_dom_data *ce_data = get_ce_data(dom_data);
	struct task_struct *next = NULL;
	int exists, sleep, should_sched_exists, should_sched_blocked,
	    should_sched_asleep;

	raw_spin_lock(dom->lock);

	/* sanity checking */
	BUG_ON(ce_data->scheduled && ce_data->scheduled != prev);
	BUG_ON(ce_data->scheduled && !is_realtime(prev));
	BUG_ON(is_realtime(prev) && !ce_data->scheduled);

	exists = NULL != ce_data->scheduled;
	sleep = exists && RT_F_SLEEP == get_rt_flags(ce_data->scheduled);
	TRACE("exists: %d, sleep: %d\n", exists, sleep);

	if (sleep)
		mc_ce_job_completion(ce_data->scheduled);

	/* these checks must go after the call to mc_ce_job_completion in case
	 * a late task needs to be scheduled again right away and it's the only
	 * task on a core */
	should_sched_exists = NULL != ce_data->should_schedule;
	should_sched_blocked = should_sched_exists &&
		!is_running(ce_data->should_schedule);
	should_sched_asleep = should_sched_exists &&
		RT_F_SLEEP == get_rt_flags(ce_data->should_schedule);

	TRACE("should_sched_exists: %d, should_sched_blocked: %d, "
			"should_sched_asleep: %d\n", should_sched_exists,
			should_sched_blocked, should_sched_asleep);

	if (should_sched_exists && !should_sched_blocked &&
			!should_sched_asleep) {
		/*
		 * schedule the task that should be executing in the cyclic
		 * schedule if it is not blocked and not sleeping
		 */
		next = ce_data->should_schedule;
	}
	sched_state_task_picked();
	raw_spin_unlock(dom->lock);
	return next;
}

static void mc_ce_finish_switch(struct task_struct *prev)
{
	domain_data_t *dom_data = &per_cpu(mc_ce_doms, smp_processor_id());
	struct ce_dom_data *ce_data = get_ce_data(dom_data);

	TRACE("finish switch\n");

	if (is_realtime(current) && CRIT_LEVEL_A == tsk_mc_crit(current))
		ce_data->scheduled = current;
	else
		ce_data->scheduled = NULL;
}

/*
 * Called for every local timer interrupt.
 * Linux calls this with interrupts disabled, AFAIK.
 */
static void mc_ce_tick(struct task_struct *ts)
{
	domain_data_t *dom_data = &per_cpu(mc_ce_doms, smp_processor_id());
	domain_t *dom = &dom_data->domain;
	struct ce_dom_data *ce_data = get_ce_data(dom_data);
	struct task_struct *should_schedule;

	if (is_realtime(ts) && CRIT_LEVEL_A == tsk_mc_crit(ts)) {
		raw_spin_lock(dom->lock);
		should_schedule = ce_data->should_schedule;
		raw_spin_unlock(dom->lock);

		if (!is_np(ts) && ts != should_schedule) {
			litmus_reschedule_local();
		} else if (is_user_np(ts)) {
			request_exit_np(ts);
		}
	}
}
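/*
 * The decision logic in mc_ce_schedule() above reduces to a single
 * predicate: run should_schedule iff it exists, is runnable (not blocked),
 * and has not signaled completion via RT_F_SLEEP. A hypothetical helper
 * expressing that predicate, for illustration only:
 */
static inline __maybe_unused int ce_may_run(struct task_struct *t)
{
	return NULL != t && is_running(t) && RT_F_SLEEP != get_rt_flags(t);
}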
/*
 * Admit task called to see if this task is permitted to enter the system.
 * Here we look up the task's PID structure and save it in the proper slot on
 * the CPU this task will run on.
 */
static long __mc_ce_admit_task(struct task_struct *ts)
{
	domain_data_t *dom_data = &per_cpu(mc_ce_doms, get_partition(ts));
	struct ce_dom_data *ce_data = get_ce_data(dom_data);
	struct mc_data *mcd = tsk_mc_data(ts);
	struct pid *pid = NULL;
	long retval = -EINVAL;
	int lvl_a_id;

	/* check the task has migrated to the right CPU (like in sched_cedf) */
	if (task_cpu(ts) != get_partition(ts)) {
		printk(KERN_INFO "litmus: %d admitted on CPU %d but wants %d\n",
				ts->pid, task_cpu(ts), get_partition(ts));
		goto out;
	}
	/* only level A tasks can be CE */
	if (!mcd || CRIT_LEVEL_A != tsk_mc_crit(ts)) {
		printk(KERN_INFO "litmus: non-MC or non level A task %d\n",
				ts->pid);
		goto out;
	}
	/* only dereference mcd after the NULL check above */
	lvl_a_id = mcd->mc_task.lvl_a_id;

	/* try and get the task's PID structure */
	pid = get_task_pid(ts, PIDTYPE_PID);
	if (IS_ERR_OR_NULL(pid)) {
		printk(KERN_INFO "litmus: couldn't get pid struct for %d\n",
				ts->pid);
		goto out;
	}

	if (lvl_a_id >= ce_data->num_pid_entries) {
		printk(KERN_INFO "litmus: level A id greater than expected "
				"number of tasks %d for %d cpu %d\n",
				ce_data->num_pid_entries, ts->pid,
				get_partition(ts));
		goto out_put_pid;
	}
	if (ce_data->pid_entries[lvl_a_id].pid) {
		printk(KERN_INFO "litmus: have saved pid info id: %d cpu: %d\n",
				lvl_a_id, get_partition(ts));
		goto out_put_pid;
	}
	if (get_exec_cost(ts) >= ce_data->pid_entries[lvl_a_id].budget) {
		printk(KERN_INFO "litmus: execution cost %llu is larger than "
				"the budget %llu\n",
				get_exec_cost(ts),
				ce_data->pid_entries[lvl_a_id].budget);
		goto out_put_pid;
	}
	ce_data->pid_entries[lvl_a_id].pid = pid;
	retval = 0;
	/* don't call put_pid if we are successful */
	goto out;

out_put_pid:
	put_pid(pid);
out:
	return retval;
}

static long mc_ce_admit_task(struct task_struct *ts)
{
	domain_data_t *dom_data = &per_cpu(mc_ce_doms, get_partition(ts));
	domain_t *dom = &dom_data->domain;
	unsigned long flags;
	long retval;

	raw_spin_lock_irqsave(dom->lock, flags);
	retval = __mc_ce_admit_task(ts);
	raw_spin_unlock_irqrestore(dom->lock, flags);
	return retval;
}
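/*
 * Both mc_ce_task_new() below and __mc_ce_timer_callback() answer the same
 * question: given the current time, whose window is it? The computation is
 * get_cycle_offset() composed with mc_ce_schedule_at(). A hypothetical
 * helper making that composition explicit (the caller must hold dom->lock);
 * nothing in the plugin calls it:
 */
static inline __maybe_unused struct pid *ce_pid_for_now(const domain_t *dom,
		struct ce_dom_data *ce_data)
{
	const lt_t offset = get_cycle_offset(litmus_clock(),
			ce_data->cycle_time);
	return ce_data->pid_entries[mc_ce_schedule_at(dom, offset)].pid;
}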
/*
 * Called to set up a new real-time task (after the admit_task callback).
 * At this point the task's struct pid is already hooked up on the destination
 * CPU. The task may already be running.
 */
static void mc_ce_task_new(struct task_struct *ts, int on_rq, int running)
{
	domain_data_t *dom_data = &per_cpu(mc_ce_doms, task_cpu(ts));
	domain_t *dom = &dom_data->domain;
	struct ce_dom_data *ce_data = get_ce_data(dom_data);
	struct pid *pid_should_be_running;
	struct ce_dom_pid_entry *pid_entry;
	unsigned long flags;
	int idx, should_be_running;
	lt_t offset;

	/* have to call mc_ce_schedule_at because the task only gets a PID
	 * entry after calling admit_task */
	raw_spin_lock_irqsave(dom->lock, flags);
	pid_entry = &ce_data->pid_entries[tsk_mc_data(ts)->mc_task.lvl_a_id];
	/* initialize some task state */
	set_rt_flags(ts, RT_F_RUNNING);
	tsk_rt(ts)->job_params.job_no = 0;

	offset = get_cycle_offset(litmus_clock(), ce_data->cycle_time);
	idx = mc_ce_schedule_at(dom, offset);
	pid_should_be_running = ce_data->pid_entries[idx].pid;
	rcu_read_lock();
	should_be_running = (ts == pid_task(pid_should_be_running, PIDTYPE_PID));
	rcu_read_unlock();
	if (running) {
		/* admit task checks that the task is not on the wrong CPU */
		BUG_ON(task_cpu(ts) != get_partition(ts));
		BUG_ON(ce_data->scheduled);
		ce_data->scheduled = ts;

		if (should_be_running)
			ce_data->should_schedule = ts;
		else
			preempt_if_preemptable(ce_data->scheduled,
					ce_data->cpu);
	} else if (!running && should_be_running) {
		ce_data->should_schedule = ts;
		preempt_if_preemptable(ce_data->scheduled, ce_data->cpu);
	}
	raw_spin_unlock_irqrestore(dom->lock, flags);
}

/*
 * Called to re-introduce a task after blocking.
 * Can potentially be called multiple times.
 */
static void mc_ce_task_wake_up(struct task_struct *ts)
{
	domain_data_t *dom_data = &per_cpu(mc_ce_doms, smp_processor_id());
	domain_t *dom = &dom_data->domain;
	struct ce_dom_data *ce_data = get_ce_data(dom_data);
	unsigned long flags;

	TRACE_TASK(ts, "wake up\n");

	raw_spin_lock_irqsave(dom->lock, flags);
	if (ts == ce_data->should_schedule && ts != ce_data->scheduled)
		preempt_if_preemptable(ts, ce_data->cpu);
	raw_spin_unlock_irqrestore(dom->lock, flags);
}

/*
 * Called to notify the plugin of a blocking real-time task. Only called for
 * real-time tasks and before schedule is called.
 */
static void mc_ce_task_block(struct task_struct *ts)
{
	/* nothing to do because it will be taken care of in schedule */
	TRACE_TASK(ts, "blocked\n");
}
/*
 * Called when a task switches from RT mode back to normal mode.
 */
void mc_ce_task_exit(struct task_struct *ts)
{
	domain_data_t *dom_data = &per_cpu(mc_ce_doms, get_partition(ts));
	domain_t *dom = &dom_data->domain;
	struct ce_dom_data *ce_data = get_ce_data(dom_data);
	unsigned long flags;
	struct pid *pid;
	const int lvl_a_id = tsk_mc_data(ts)->mc_task.lvl_a_id;

	TRACE_TASK(ts, "exited\n");

	BUG_ON(task_cpu(ts) != get_partition(ts));
	BUG_ON(CRIT_LEVEL_A != tsk_mc_crit(ts));
	BUG_ON(lvl_a_id >= ce_data->num_pid_entries);

	raw_spin_lock_irqsave(dom->lock, flags);
	pid = ce_data->pid_entries[lvl_a_id].pid;
	BUG_ON(!pid);
	put_pid(pid);
	ce_data->pid_entries[lvl_a_id].pid = NULL;
	if (ce_data->scheduled == ts)
		ce_data->scheduled = NULL;
	if (ce_data->should_schedule == ts)
		ce_data->should_schedule = NULL;
	raw_spin_unlock_irqrestore(dom->lock, flags);
}

/***********************************************************
 * Timer stuff
 **********************************************************/

void __mc_ce_timer_callback(struct hrtimer *timer)
{
	/* relative and absolute times for cycles */
	lt_t now, offset_rel, cycle_start_abs, next_timer_abs;
	struct task_struct *should_schedule;
	struct ce_dom_pid_entry *pid_entry;
	struct ce_dom_data *ce_data;
	domain_data_t *dom_data;
	domain_t *dom;
	int idx, budget_overrun;

	ce_data = container_of(timer, struct ce_dom_data, timer);
	dom_data = &per_cpu(mc_ce_doms, ce_data->cpu);
	dom = &dom_data->domain;

	/* Based off of the current time, figure out the offset into the cycle
	 * and the cycle's start time, and determine what should be scheduled.
	 */
	now = litmus_clock();
	offset_rel = get_cycle_offset(now, ce_data->cycle_time);
	cycle_start_abs = now - offset_rel;
	idx = mc_ce_schedule_at(dom, offset_rel);
	pid_entry = &ce_data->pid_entries[idx];
	/* set the timer to fire at the end of the current window, i.e. the
	 * start of the next one */
	next_timer_abs = cycle_start_abs + pid_entry->acc_time;
	hrtimer_set_expires(timer, ns_to_ktime(next_timer_abs));

	TRACE("timer: now: %llu offset_rel: %llu cycle_start_abs: %llu "
			"next_timer_abs: %llu\n", now, offset_rel,
			cycle_start_abs, next_timer_abs);

	/* get the task_struct (pid_task can accept a NULL) */
	rcu_read_lock();
	should_schedule = pid_task(pid_entry->pid, PIDTYPE_PID);
	rcu_read_unlock();
	ce_data->should_schedule = should_schedule;

	if (should_schedule && 0 == atomic_read(&start_time_set)) {
		/*
		 * If jobs are not overrunning their budgets, then this
		 * should not happen.
		 */
		pid_entry->expected_job++;
		budget_overrun = pid_entry->expected_job !=
			tsk_rt(should_schedule)->job_params.job_no;
		if (budget_overrun)
			TRACE_TASK(should_schedule,
					"timer expected job number: %d "
					"but current job: %d\n",
					pid_entry->expected_job,
					tsk_rt(should_schedule)->job_params.job_no);
	}

	if (ce_data->should_schedule) {
		tsk_rt(should_schedule)->job_params.deadline =
			cycle_start_abs + pid_entry->acc_time;
		tsk_rt(should_schedule)->job_params.release =
			tsk_rt(should_schedule)->job_params.deadline -
			pid_entry->budget;
		tsk_rt(should_schedule)->job_params.exec_time = 0;
		sched_trace_task_release(should_schedule);
		set_rt_flags(ce_data->should_schedule, RT_F_RUNNING);
	}
}
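/*
 * Worked example of the window arithmetic above (all values made up): with
 * a 30 ms cycle that started at t = 100 ms and a window whose acc_time is
 * 10 ms and whose budget is 4 ms, a timer firing at t = 106 ms computes
 * offset_rel = 6 ms, cycle_start_abs = 100 ms, next_timer_abs = 110 ms,
 * a deadline of 110 ms, and a release of 106 ms (deadline minus budget).
 * A hypothetical helper mirroring just the deadline/release math:
 */
static inline __maybe_unused void ce_window_times(lt_t cycle_start_abs,
		lt_t acc_time, lt_t budget, lt_t *release, lt_t *deadline)
{
	*deadline = cycle_start_abs + acc_time;	/* end of the window */
	*release = *deadline - budget;		/* one budget before the end */
}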
/*
 * What to do when a timer fires. The timer should only be armed if the number
 * of PID entries is positive.
 */
static enum hrtimer_restart mc_ce_timer_callback(struct hrtimer *timer)
{
	struct ce_dom_data *ce_data;
	unsigned long flags;
	domain_data_t *dom_data;
	domain_t *dom;

	ce_data = container_of(timer, struct ce_dom_data, timer);
	dom_data = &per_cpu(mc_ce_doms, ce_data->cpu);
	dom = &dom_data->domain;
	TRACE("timer callback on CPU %d (before lock)\n", ce_data->cpu);

	raw_spin_lock_irqsave(dom->lock, flags);
	__mc_ce_timer_callback(timer);

	if (ce_data->scheduled != ce_data->should_schedule)
		preempt_if_preemptable(ce_data->scheduled, ce_data->cpu);

	raw_spin_unlock_irqrestore(dom->lock, flags);
	return HRTIMER_RESTART;
}

/*
 * Cancel timers on all CPUs. Returns 1 if any were active.
 */
static int cancel_all_timers(void)
{
	struct ce_dom_data *ce_data;
	domain_data_t *dom_data;
	int cpu, ret = 0, cancel_res;

	TRACE("cancel all timers\n");

	for_each_online_cpu(cpu) {
		dom_data = &per_cpu(mc_ce_doms, cpu);
		ce_data = get_ce_data(dom_data);
		ce_data->should_schedule = NULL;
		cancel_res = hrtimer_cancel(&ce_data->timer);
		atomic_set(&ce_data->timer_info.state,
				HRTIMER_START_ON_INACTIVE);
		ret = ret || cancel_res;
	}
	return ret;
}

/*
 * Arm all timers so that they start at the new value of start time.
 * Any CPU without CE PID entries won't have a timer armed.
 * All timers should be canceled before calling this.
 */
static void arm_all_timers(void)
{
	struct ce_dom_data *ce_data;
	domain_data_t *dom_data;
	int cpu, idx;
	const lt_t start = atomic64_read(&start_time);

	TRACE("arm all timers\n");

	for_each_online_cpu(cpu) {
		dom_data = &per_cpu(mc_ce_doms, cpu);
		ce_data = get_ce_data(dom_data);
		if (0 == ce_data->num_pid_entries)
			continue;
		for (idx = 0; idx < ce_data->num_pid_entries; idx++) {
			ce_data->pid_entries[idx].expected_job = -1;
		}
		TRACE("arming timer for CPU %d\n", cpu);
		hrtimer_start_on(cpu, &ce_data->timer_info, &ce_data->timer,
				ns_to_ktime(start), HRTIMER_MODE_ABS_PINNED);
	}
}
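/*
 * mc_ce_release_at() below resolves a race between CPUs calling the release
 * syscall: start_time_set is initialized to -1, so the first caller's
 * atomic_inc_and_test() moves it to 0 and returns true (that caller wins
 * and re-arms the timers); every later caller increments past 0, sees
 * false, and undoes its increment. An illustrative (hypothetical) helper
 * naming that protocol, for exposition only:
 */
static inline __maybe_unused int ce_won_release_race(atomic_t *flag)
{
	if (atomic_inc_and_test(flag))
		return 1;	/* -1 -> 0: we get to set the start time */
	atomic_dec(flag);	/* lost the race: restore the counter */
	return 0;
}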
/*
 * There are no real releases in the CE, but the task release syscall will
 * call this. We can re-set our notion of the CE period start to make
 * the schedule look pretty.
 */
void mc_ce_release_at(struct task_struct *ts, lt_t start)
{
	TRACE_TASK(ts, "release at\n");
	if (atomic_inc_and_test(&start_time_set)) {
		/* in this case, we won the race */
		cancel_all_timers();
		atomic64_set(&start_time, start);
		arm_all_timers();
	} else
		atomic_dec(&start_time_set);
}

long mc_ce_activate_plugin(void)
{
	struct ce_dom_data *ce_data;
	domain_data_t *dom_data;
	int cpu;

	for_each_online_cpu(cpu) {
		dom_data = &per_cpu(mc_ce_doms, cpu);
		ce_data = get_ce_data(dom_data);
		ce_data->scheduled = NULL;
		ce_data->should_schedule = NULL;
	}

	atomic_set(&start_time_set, -1);
	atomic64_set(&start_time, litmus_clock());
	/* may not want to arm timers on activation, just after release */
	arm_all_timers();
	return 0;
}

static void clear_pid_entries(void)
{
	int cpu, entry;
	domain_data_t *dom_data;
	struct ce_dom_data *ce_data;

	for_each_online_cpu(cpu) {
		dom_data = &per_cpu(mc_ce_doms, cpu);
		ce_data = get_ce_data(dom_data);
		ce_data->num_pid_entries = 0;
		ce_data->cycle_time = 0;
		for (entry = 0; entry < CONFIG_PLUGIN_MC_LEVEL_A_MAX_TASKS;
				++entry) {
			if (NULL != ce_data->pid_entries[entry].pid) {
				put_pid(ce_data->pid_entries[entry].pid);
				ce_data->pid_entries[entry].pid = NULL;
			}
			ce_data->pid_entries[entry].budget = 0;
			ce_data->pid_entries[entry].acc_time = 0;
			ce_data->pid_entries[entry].expected_job = -1;
		}
	}
}

long mc_ce_deactivate_plugin(void)
{
	cancel_all_timers();
	return 0;
}

/* Plugin object */
static struct sched_plugin mc_ce_plugin __cacheline_aligned_in_smp = {
	.plugin_name		= "MC-CE",
	.admit_task		= mc_ce_admit_task,
	.task_new		= mc_ce_task_new,
	.complete_job		= complete_job,
	.release_at		= mc_ce_release_at,
	.task_exit		= mc_ce_task_exit,
	.schedule		= mc_ce_schedule,
	.finish_switch		= mc_ce_finish_switch,
	.tick			= mc_ce_tick,
	.task_wake_up		= mc_ce_task_wake_up,
	.task_block		= mc_ce_task_block,
	.activate_plugin	= mc_ce_activate_plugin,
	.deactivate_plugin	= mc_ce_deactivate_plugin,
};

static int setup_proc(void);

static int __init init_sched_mc_ce(void)
{
	struct ce_dom_data *ce_data;
	domain_data_t *dom_data;
	domain_t *dom;
	rt_domain_t *rt;
	int cpu, err;

	for_each_online_cpu(cpu) {
		dom_data = &per_cpu(mc_ce_doms, cpu);
		dom = &dom_data->domain;
		rt = &per_cpu(mc_ce_rts, cpu);
		pd_domain_init(dom, rt, NULL, NULL, NULL, NULL, NULL);
		dom->data = &per_cpu(_mc_ce_dom_data, cpu);
		ce_data = get_ce_data(dom_data);
		hrtimer_init(&ce_data->timer, CLOCK_MONOTONIC,
				HRTIMER_MODE_ABS);
		hrtimer_start_on_info_init(&ce_data->timer_info);
		ce_data->cpu = cpu;
		ce_data->timer.function = mc_ce_timer_callback;
	}
	clear_pid_entries();
	err = setup_proc();
	if (!err)
		err = register_sched_plugin(&mc_ce_plugin);
	return err;
}

#define BUF_SIZE PAGE_SIZE
static int write_into_proc(char *proc_buf, const int proc_size, char *fmt, ...)
{
	static char buf[BUF_SIZE];
	int n;
	va_list args;

	/* When writing to procfs, we don't care about the trailing null
	 * byte, which is not included in the count returned by vsnprintf. */
	va_start(args, fmt);
	n = vsnprintf(buf, BUF_SIZE, fmt, args);
	va_end(args);
	if (BUF_SIZE <= n || proc_size <= n) {
		/* too big for formatting buffer or proc (less null byte) */
		n = -EINVAL;
		goto out;
	}
	memcpy(proc_buf, buf, n);
out:
	return n;
}
#undef BUF_SIZE
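/*
 * For reference, reading the proc file produces one comment line and one
 * entry line per window, formatted by write_pid_entry() below. A plausible
 * (made-up) excerpt:
 *
 *	# task: 1423 mytask
 *	0, 0, 10000000
 *	# task: no
 *	0, 1, 20000000
 */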
/*
 * Writes a PID entry to the procfs.
 *
 * @page	buffer to write into
 * @count	bytes available in the buffer
 */
#define PID_SPACE 15
#define TASK_INFO_BUF (PID_SPACE + TASK_COMM_LEN)
static int write_pid_entry(char *page, const int count, const int cpu,
		const int task, struct ce_dom_pid_entry *pid_entry)
{
	static char task_info[TASK_INFO_BUF];
	struct task_struct *ts;
	int n = 0, err, ti_n;
	char *ti_b;

	if (pid_entry->pid) {
		rcu_read_lock();
		ts = pid_task(pid_entry->pid, PIDTYPE_PID);
		rcu_read_unlock();

		/* get some information about the task */
		if (ts) {
			ti_b = task_info;
			ti_n = snprintf(ti_b, PID_SPACE, "%d", ts->pid);
			if (PID_SPACE <= ti_n)
				ti_n = PID_SPACE - 1;
			ti_b += ti_n;
			*ti_b = ' '; /* nuke the null byte */
			ti_b++;
			get_task_comm(ti_b, ts);
		} else {
			strncpy(task_info, "pid_task() failed :(",
					TASK_INFO_BUF);
		}
	} else
		strncpy(task_info, "no", TASK_INFO_BUF);
	task_info[TASK_INFO_BUF - 1] = '\0'; /* just to be sure */

	err = write_into_proc(page + n, count - n, "# task: %s\n", task_info);
	if (err < 0) {
		n = -ENOSPC;
		goto out;
	}
	n += err;
	err = write_into_proc(page + n, count - n, "%d, %d, %llu\n",
			cpu, task, pid_entry->budget);
	if (err < 0) {
		n = -ENOSPC;
		goto out;
	}
	n += err;
out:
	return n;
}
#undef PID_SPACE
#undef TASK_INFO_BUF

/*
 * Called when user-land reads from proc.
 */
static int proc_read_ce_file(char *page, char **start, off_t off, int count,
		int *eof, void *data)
{
	int n = 0, err, cpu, t;
	struct ce_dom_data *ce_data;
	domain_data_t *dom_data;

	if (off > 0) {
		printk(KERN_INFO "litmus: MC-CE called read with off > 0\n");
		goto out;
	}

	for_each_online_cpu(cpu) {
		dom_data = &per_cpu(mc_ce_doms, cpu);
		ce_data = get_ce_data(dom_data);
		for (t = 0; t < ce_data->num_pid_entries; ++t) {
			err = write_pid_entry(page + n, count - n,
					cpu, t, &ce_data->pid_entries[t]);
			if (err < 0) {
				n = -ENOSPC;
				goto out;
			}
			n += err;
		}
	}
out:
	*eof = 1;
	return n;
}

/*
 * Skip a commented line.
 */
static int skip_comment(const char *buf, const unsigned long max)
{
	unsigned long i = 0;
	const char *c = buf;

	if (0 == max || !c || *c != '#')
		return 0;
	++c;
	++i;
	for (; i < max; ++i) {
		if (*c == '\n') {
			++c;
			++i;
			break;
		}
		++c;
	}
	return i;
}

/* a budget of 5 milliseconds is probably reasonable */
#define BUDGET_THRESHOLD 5000000ULL
static int setup_pid_entry(const int cpu, const int task, const lt_t budget)
{
	domain_data_t *dom_data = &per_cpu(mc_ce_doms, cpu);
	struct ce_dom_data *ce_data = get_ce_data(dom_data);
	struct ce_dom_pid_entry *new_entry;
	int err = 0;

	/* check the inputs */
	if (cpu < 0 || NR_CPUS <= cpu || task < 0 ||
			CONFIG_PLUGIN_MC_LEVEL_A_MAX_TASKS <= task ||
			budget < 1) {
		printk(KERN_INFO "litmus: bad cpu, task ID, or budget sent to "
				"MC-CE proc\n");
		err = -EINVAL;
		goto out;
	}
	/* warn about suspiciously small budgets */
	if (BUDGET_THRESHOLD > budget) {
		printk(KERN_CRIT "litmus: you gave a small budget for an "
				"MC-CE task; that might be an issue.\n");
	}
	/* check that we have space for a new entry */
	if (CONFIG_PLUGIN_MC_LEVEL_A_MAX_TASKS <= ce_data->num_pid_entries) {
		printk(KERN_INFO "litmus: too many MC-CE tasks for cpu "
				"%d\n", cpu);
		err = -EINVAL;
		goto out;
	}
	/* add the new entry */
	new_entry = &ce_data->pid_entries[ce_data->num_pid_entries];
	BUG_ON(NULL != new_entry->pid);
	new_entry->budget = budget;
	new_entry->acc_time = ce_data->cycle_time + budget;
	/* update the domain entry */
	ce_data->cycle_time += budget;
	ce_data->num_pid_entries++;
out:
	return err;
}
#undef BUDGET_THRESHOLD
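/*
 * Example input accepted by the parser below (values made up; budgets are
 * in nanoseconds). Each line appends one window to the named CPU's cycle,
 * in table order, and '#' starts a comment:
 *
 *	# cpu, task, budget
 *	0, 0, 10000000
 *	0, 1, 20000000
 *	1, 0, 15000000
 */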
/*
 * Called when user-land writes to proc.
 *
 * Error checking is quite minimal. The expected format is:
 *	<cpu>, <task>, <budget>
 */
#define PROCFS_MAX_SIZE PAGE_SIZE
static int proc_write_ce_file(struct file *file, const char __user *buffer,
		unsigned long count, void *data)
{
	static char kbuf[PROCFS_MAX_SIZE];
	char *c = kbuf, *c_skipped;
	int cpu, task, cnt = 0, chars_read, converted, err;
	lt_t budget;

	if (is_active_plugin()) {
		printk(KERN_INFO "litmus: can't edit MC-CE proc when plugin "
				"active\n");
		cnt = -EINVAL;
		goto out;
	}

	if (count > PROCFS_MAX_SIZE) {
		printk(KERN_INFO "litmus: MC-CE procfs got too many bytes "
				"from user-space.\n");
		cnt = -EINVAL;
		goto out;
	}

	if (copy_from_user(kbuf, buffer, count)) {
		printk(KERN_INFO "litmus: couldn't copy from user %s\n",
				__func__);
		cnt = -EFAULT;
		goto out;
	}
	clear_pid_entries();
	while (cnt < count) {
		c_skipped = skip_spaces(c);
		if (c_skipped != c) {
			chars_read = c_skipped - c;
			cnt += chars_read;
			c += chars_read;
			continue;
		}
		if (*c == '#') {
			chars_read = skip_comment(c, count - cnt);
			cnt += chars_read;
			c += chars_read;
			continue;
		}
		converted = sscanf(c, "%d, %d, %llu%n", &cpu, &task, &budget,
				&chars_read);
		if (3 != converted) {
			printk(KERN_INFO "litmus: MC-CE procfs expected three "
					"arguments, but got %d.\n", converted);
			cnt = -EINVAL;
			goto out;
		}
		cnt += chars_read;
		c += chars_read;
		err = setup_pid_entry(cpu, task, budget);
		if (err) {
			cnt = -EINVAL;
			goto out;
		}
	}
out:
	return cnt;
}
#undef PROCFS_MAX_SIZE

#define CE_FILE_PROC_NAME "ce_file"
static void tear_down_proc(void)
{
	if (ce_file)
		remove_proc_entry(CE_FILE_PROC_NAME, mc_ce_dir);
	if (mc_ce_dir)
		remove_plugin_proc_dir(&mc_ce_plugin);
}

static int setup_proc(void)
{
	int err;

	err = make_plugin_proc_dir(&mc_ce_plugin, &mc_ce_dir);
	if (err) {
		printk(KERN_ERR "could not create MC-CE procfs dir.\n");
		goto out;
	}
	ce_file = create_proc_entry(CE_FILE_PROC_NAME, 0644, mc_ce_dir);
	if (!ce_file) {
		printk(KERN_ERR "could not create MC-CE procfs file.\n");
		err = -EIO;
		goto out_remove_proc;
	}
	ce_file->read_proc = proc_read_ce_file;
	ce_file->write_proc = proc_write_ce_file;
	goto out;
out_remove_proc:
	tear_down_proc();
out:
	return err;
}
#undef CE_FILE_PROC_NAME

static void clean_sched_mc_ce(void)
{
	tear_down_proc();
}

module_init(init_sched_mc_ce);
module_exit(clean_sched_mc_ce);