#include <linux/sched.h>
#include <linux/module.h>
#include <linux/uaccess.h>

#include <litmus/ftdev.h>
#include <litmus/litmus.h>
#include <litmus/trace.h>

/******************************************************************************/
/*                                Allocation                                  */
/******************************************************************************/

static struct ftdev cpu_overhead_dev;
static struct ftdev msg_overhead_dev;

#define cpu_trace_ts_buf(cpu) cpu_overhead_dev.minor[(cpu)].buf
#define msg_trace_ts_buf(cpu) msg_overhead_dev.minor[(cpu)].buf

DEFINE_PER_CPU(atomic_t, irq_fired_count);
DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, cpu_irq_fired_count);

static DEFINE_PER_CPU(unsigned int, cpu_ts_seq_no);
static DEFINE_PER_CPU(unsigned int, msg_ts_seq_no);

static int64_t cycle_offset[NR_CPUS][NR_CPUS];

void ft_irq_fired(void)
{
	/* Only called with preemptions disabled. */
	atomic_inc(this_cpu_ptr(&irq_fired_count));
	atomic_inc(this_cpu_ptr(&cpu_irq_fired_count));

	if (has_control_page(current))
		get_control_page(current)->irq_count++;
}

static inline void clear_irq_fired(void)
{
	atomic_set(raw_cpu_ptr(&irq_fired_count), 0);
}

static inline unsigned int get_and_clear_irq_fired(void)
{
	/* This is potentially not atomic since we might migrate if
	 * preemptions are not disabled. As a tradeoff between
	 * accuracy and tracing overheads, this seems acceptable.
	 * If it proves to be a problem, then one could add a callback
	 * from the migration code to invalidate irq_fired_count.
	 */
	return atomic_xchg(raw_cpu_ptr(&irq_fired_count), 0);
}

static inline unsigned int get_and_clear_irq_fired_for_cpu(int cpu)
{
	return atomic_xchg(&per_cpu(irq_fired_count, cpu), 0);
}

static inline void cpu_clear_irq_fired(void)
{
	atomic_set(raw_cpu_ptr(&cpu_irq_fired_count), 0);
}

static inline unsigned int cpu_get_and_clear_irq_fired(void)
{
	return atomic_xchg(raw_cpu_ptr(&cpu_irq_fired_count), 0);
}

static inline void save_irq_flags(struct timestamp *ts, unsigned int irq_count)
{
	/* Store how many interrupts occurred. */
	ts->irq_count = irq_count;
	/* Extra flag because ts->irq_count overflows quickly. */
	ts->irq_flag  = irq_count > 0;
}

#define NO_IRQ_COUNT		0
#define LOCAL_IRQ_COUNT		1
#define REMOTE_IRQ_COUNT	2

#define DO_NOT_RECORD_TIMESTAMP	0
#define RECORD_LOCAL_TIMESTAMP	1
#define RECORD_OFFSET_TIMESTAMP	2
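/*
 * Common writer shared by all timestamp helpers below. The configuration
 * arguments (only_single_writer, is_cpu_timestamp, local_cpu, record_irq,
 * record_timestamp) are compile-time constants at every call site, so the
 * corresponding branches are resolved during function inlining and each
 * wrapper compiles down to a specialized writer.
 */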
static inline void __write_record(
	uint8_t event,
	uint8_t type,
	uint16_t pid_fragment,
	unsigned int irq_count,
	int record_irq,
	int hide_irq,
	uint64_t timestamp,
	int record_timestamp,

	int only_single_writer,
	int is_cpu_timestamp,
	int local_cpu,
	uint8_t other_cpu)
{
	unsigned long flags;
	unsigned int seq_no;
	struct timestamp *ts;
	int cpu;
	struct ft_buffer *buf;

	/* Avoid preemptions while recording the timestamp. This reduces the
	 * number of "out of order" timestamps in the stream and makes
	 * post-processing easier.
	 */
	local_irq_save(flags);

	if (local_cpu)
		cpu = smp_processor_id();
	else
		cpu = other_cpu;

	/* resolved during function inlining */
	if (is_cpu_timestamp) {
		seq_no = __this_cpu_inc_return(cpu_ts_seq_no);
		buf = cpu_trace_ts_buf(cpu);
	} else {
		seq_no = fetch_and_inc((int *) &per_cpu(msg_ts_seq_no, cpu));
		buf = msg_trace_ts_buf(cpu);
	}

	/* If buf is non-NULL here, then the buffer cannot be deallocated until
	 * we turn interrupts on again. This is because free_timestamp_buffer()
	 * indirectly causes TLB invalidations due to modifications of the
	 * kernel address space, namely via vfree() in free_ft_buffer(), which
	 * cannot be processed until we turn on interrupts again.
	 */
	if (buf && (only_single_writer /* resolved during function inlining */
		    ? ft_buffer_start_single_write(buf, (void **) &ts)
		    : ft_buffer_start_write(buf, (void **) &ts))) {
		ts->event     = event;
		ts->seq_no    = seq_no;

		ts->task_type = type;
		ts->pid       = pid_fragment;

		ts->cpu       = cpu;

		if (record_irq) {
			if (local_cpu)
				irq_count = cpu_get_and_clear_irq_fired();
			else
				irq_count = get_and_clear_irq_fired_for_cpu(cpu);
		}

		save_irq_flags(ts, irq_count - hide_irq);

		if (record_timestamp)
			timestamp = ft_timestamp();
		if (record_timestamp == RECORD_OFFSET_TIMESTAMP)
			timestamp += cycle_offset[smp_processor_id()][cpu];

		ts->timestamp = timestamp;
		ft_buffer_finish_write(buf, ts);
	}

	local_irq_restore(flags);
}

static inline void write_cpu_timestamp(
	uint8_t event,
	uint8_t type,
	uint16_t pid_fragment,
	unsigned int irq_count,
	int record_irq,
	int hide_irq,
	uint64_t timestamp,
	int record_timestamp)
{
	__write_record(event, type,
		       pid_fragment,
		       irq_count, record_irq, hide_irq,
		       timestamp, record_timestamp,
		       1 /* only_single_writer */,
		       1 /* is_cpu_timestamp */,
		       1 /* local_cpu */,
		       0xff /* other_cpu */);
}

static inline void save_msg_timestamp(
	uint8_t event,
	int hide_irq)
{
	struct task_struct *t = current;

	__write_record(event, is_realtime(t) ? TSK_RT : TSK_BE,
		       t->pid,
		       0, LOCAL_IRQ_COUNT, hide_irq,
		       0, RECORD_LOCAL_TIMESTAMP,
		       0 /* only_single_writer */,
		       0 /* is_cpu_timestamp */,
		       1 /* local_cpu */,
		       0xff /* other_cpu */);
}

static inline void save_remote_msg_timestamp(
	uint8_t event,
	uint8_t remote_cpu)
{
	struct task_struct *t = current;

	__write_record(event, is_realtime(t) ? TSK_RT : TSK_BE,
		       t->pid,
		       0, REMOTE_IRQ_COUNT, 0,
		       0, RECORD_OFFSET_TIMESTAMP,
		       0 /* only_single_writer */,
		       0 /* is_cpu_timestamp */,
		       0 /* local_cpu */,
		       remote_cpu);
}

feather_callback void save_cpu_timestamp_def(unsigned long event,
					     unsigned long type)
{
	write_cpu_timestamp(event, type,
			    current->pid,
			    0, LOCAL_IRQ_COUNT, 0,
			    0, RECORD_LOCAL_TIMESTAMP);
}

feather_callback void save_cpu_timestamp_task(unsigned long event,
					      unsigned long t_ptr)
{
	struct task_struct *t = (struct task_struct *) t_ptr;
	int rt = is_realtime(t);

	write_cpu_timestamp(event, rt ? TSK_RT : TSK_BE,
			    t->pid,
			    0, LOCAL_IRQ_COUNT, 0,
			    0, RECORD_LOCAL_TIMESTAMP);
}

/* fake timestamp to user-reported time */
feather_callback void save_cpu_timestamp_time(unsigned long event,
					      unsigned long ptr)
{
	uint64_t *time = (uint64_t *) ptr;

	write_cpu_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
			    current->pid,
			    0, LOCAL_IRQ_COUNT, 0,
			    *time, DO_NOT_RECORD_TIMESTAMP);
}

/* Record user-reported IRQ count */
feather_callback void save_cpu_timestamp_irq(unsigned long event,
					     unsigned long irq_counter_ptr)
{
	uint64_t *irqs = (uint64_t *) irq_counter_ptr;

	write_cpu_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
			    current->pid,
			    *irqs, NO_IRQ_COUNT, 0,
			    0, RECORD_LOCAL_TIMESTAMP);
}

feather_callback void save_cpu_task_latency(unsigned long event,
					    unsigned long when_ptr)
{
	lt_t now = litmus_clock();
	lt_t *when = (lt_t *) when_ptr;

	write_cpu_timestamp(event, TSK_RT,
			    0,
			    0, LOCAL_IRQ_COUNT, 0,
			    now - *when, DO_NOT_RECORD_TIMESTAMP);
}

feather_callback void msg_sent(unsigned long event, unsigned long to)
{
	save_remote_msg_timestamp(event, to);
}

/* Suppresses one IRQ from the irq count. Used by TS_SEND_RESCHED_END, which is
 * called from within an interrupt that is expected.
 */
feather_callback void msg_received(unsigned long event)
{
	save_msg_timestamp(event, 1);
}
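/*
 * Insert a timestamp that was pre-recorded in userspace (see
 * write_timestamp_from_user() below) into the local CPU's trace buffer.
 * The sequence number, CPU, and IRQ counts are overwritten with current
 * kernel-side values; all other fields are taken verbatim from the
 * user-supplied record.
 */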
static void __add_timestamp_user(struct timestamp *pre_recorded)
{
	unsigned long flags;
	unsigned int seq_no;
	struct timestamp *ts;
	struct ft_buffer *buf;
	int cpu;

	local_irq_save(flags);

	cpu = smp_processor_id();
	buf = cpu_trace_ts_buf(cpu);

	seq_no = __this_cpu_inc_return(cpu_ts_seq_no);
	if (buf && ft_buffer_start_single_write(buf, (void **) &ts)) {
		*ts = *pre_recorded;
		ts->seq_no = seq_no;
		ts->cpu    = raw_smp_processor_id();
		save_irq_flags(ts, get_and_clear_irq_fired());
		ft_buffer_finish_write(buf, ts);
	}

	local_irq_restore(flags);
}

/******************************************************************************/
/*                            DEVICE FILE DRIVER                              */
/******************************************************************************/

struct calibrate_info {
	atomic_t ready;

	uint64_t cycle_count;
};

/*
 * TSC offset calibration: calibrate_cpu() sends an IPI to a remote CPU, both
 * sides rendezvous via info->ready, and then each side reads its local cycle
 * counter as closely together as possible. The difference approximates the
 * remote CPU's TSC offset relative to this CPU.
 */
static void calibrate_helper(void *_info)
{
	struct calibrate_info *info = _info;

	/* check in with master */
	atomic_inc(&info->ready);

	/* wait for master to signal start */
	while (atomic_read(&info->ready))
		cpu_relax();

	/* report time stamp */
	info->cycle_count = ft_timestamp();

	/* tell master that we are done */
	atomic_inc(&info->ready);
}

static int64_t calibrate_cpu(int cpu)
{
	uint64_t cycles;
	struct calibrate_info info;
	unsigned long flags;
	int64_t delta;

	atomic_set(&info.ready, 0);
	info.cycle_count = 0;
	smp_wmb();

	smp_call_function_single(cpu, calibrate_helper, &info, 0);

	/* wait for helper to become active */
	while (!atomic_read(&info.ready))
		cpu_relax();

	/* avoid interrupt interference */
	local_irq_save(flags);

	/* take measurement */
	atomic_set(&info.ready, 0);
	smp_wmb();
	cycles = ft_timestamp();

	/* wait for helper reading */
	while (!atomic_read(&info.ready))
		cpu_relax();

	/* positive offset: the other guy is ahead of us */
	delta  = (int64_t) info.cycle_count;
	delta -= (int64_t) cycles;

	local_irq_restore(flags);

	return delta;
}

#define NUM_SAMPLES 10

static long calibrate_tsc_offsets(struct ftdev *ftdev, unsigned int idx,
				  unsigned long uarg)
{
	int cpu, self, i;
	int64_t delta, sample;

	preempt_disable();
	self = smp_processor_id();

	if (uarg)
		printk(KERN_INFO
		       "Feather-Trace: determining TSC offsets for P%d\n",
		       self);

	for_each_online_cpu(cpu)
		if (cpu != self) {
			delta = calibrate_cpu(cpu);
			for (i = 1; i < NUM_SAMPLES; i++) {
				sample = calibrate_cpu(cpu);
				/* keep the smallest sample as the offset estimate */
				delta = sample < delta ? sample : delta;
			}

			cycle_offset[self][cpu] = delta;

			if (uarg)
				printk(KERN_INFO
				       "Feather-Trace: TSC offset for P%d->P%d is %lld cycles.\n",
				       self, cpu, cycle_offset[self][cpu]);
		}

	preempt_enable();

	return 0;
}

#define NO_TIMESTAMPS (2 << CONFIG_SCHED_OVERHEAD_TRACE_SHIFT)

static int alloc_timestamp_buffer(struct ftdev *ftdev, unsigned int idx)
{
	unsigned int count = NO_TIMESTAMPS;

	/* An overhead-tracing timestamp should be exactly 16 bytes long. */
	BUILD_BUG_ON(sizeof(struct timestamp) != 16);

	/* On allocation failure, retry with half as many timestamps. */
	while (count && !ftdev->minor[idx].buf) {
		printk("time stamp buffer: trying to allocate %u time stamps for minor=%u.\n",
		       count, idx);
		ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp));
		count /= 2;
	}

	return ftdev->minor[idx].buf ? 0 : -ENOMEM;
}

static void free_timestamp_buffer(struct ftdev *ftdev, unsigned int idx)
{
	struct ft_buffer *tmp = ftdev->minor[idx].buf;

	ftdev->minor[idx].buf = NULL;

	/* Make sure all cores have actually seen buf == NULL before
	 * yanking out the mappings from underneath them.
	 */
	smp_wmb();

	free_ft_buffer(tmp);
}
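/*
 * write() handler of the per-CPU trace device: accepts a stream of complete
 * struct timestamp records from userspace and injects them into the trace
 * via __add_timestamp_user(). Partial records are rejected with -EINVAL;
 * a faulting copy terminates the write with -EFAULT.
 */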
static ssize_t write_timestamp_from_user(struct ft_buffer *buf, size_t len,
					 const char __user *from)
{
	ssize_t consumed = 0;
	struct timestamp ts;

	/* don't give us partial timestamps */
	if (len % sizeof(ts))
		return -EINVAL;

	while (len >= sizeof(ts)) {
		if (copy_from_user(&ts, from, sizeof(ts))) {
			consumed = -EFAULT;
			goto out;
		}
		len      -= sizeof(ts);
		from     += sizeof(ts);
		consumed += sizeof(ts);

		/* Note: this always adds to the buffer of the CPU-local
		 * device, not necessarily to the device that the system call
		 * was invoked on. This is admittedly a bit ugly, but requiring
		 * tasks to only write to the appropriate device would make
		 * tracing from userspace under global and clustered scheduling
		 * exceedingly difficult. Writing to remote buffers would
		 * require to not use ft_buffer_start_single_write(), which we
		 * want to do to reduce the number of atomic ops in the common
		 * case (which is the recording of CPU-local scheduling
		 * overheads).
		 */
		__add_timestamp_user(&ts);
	}

out:
	return consumed;
}

static int __init init_cpu_ft_overhead_trace(void)
{
	int err, cpu;

	printk("Initializing Feather-Trace per-cpu overhead tracing device.\n");
	err = ftdev_init(&cpu_overhead_dev, THIS_MODULE,
			 num_online_cpus(), "ft_cpu_trace");
	if (err)
		goto err_out;

	cpu_overhead_dev.alloc = alloc_timestamp_buffer;
	cpu_overhead_dev.free  = free_timestamp_buffer;
	cpu_overhead_dev.write = write_timestamp_from_user;

	err = register_ftdev(&cpu_overhead_dev);
	if (err)
		goto err_dealloc;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		per_cpu(cpu_ts_seq_no, cpu) = 0;
	}

	return 0;

err_dealloc:
	ftdev_exit(&cpu_overhead_dev);
err_out:
	printk(KERN_WARNING "Could not register per-cpu ft_trace device.\n");
	return err;
}

static int __init init_msg_ft_overhead_trace(void)
{
	int err, cpu;

	printk("Initializing Feather-Trace per-cpu message overhead tracing device.\n");
	err = ftdev_init(&msg_overhead_dev, THIS_MODULE,
			 num_online_cpus(), "ft_msg_trace");
	if (err)
		goto err_out;

	msg_overhead_dev.alloc = alloc_timestamp_buffer;
	msg_overhead_dev.free  = free_timestamp_buffer;
	msg_overhead_dev.calibrate = calibrate_tsc_offsets;

	err = register_ftdev(&msg_overhead_dev);
	if (err)
		goto err_dealloc;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		per_cpu(msg_ts_seq_no, cpu) = 0;
	}

	return 0;

err_dealloc:
	ftdev_exit(&msg_overhead_dev);
err_out:
	printk(KERN_WARNING "Could not register message ft_trace device.\n");
	return err;
}

static int __init init_ft_overhead_trace(void)
{
	int err, i, j;

	for (i = 0; i < NR_CPUS; i++)
		for (j = 0; j < NR_CPUS; j++)
			cycle_offset[i][j] = 0;

	err = init_cpu_ft_overhead_trace();
	if (err)
		return err;

	err = init_msg_ft_overhead_trace();
	if (err)
		ftdev_exit(&cpu_overhead_dev);

	return err;
}

static void __exit exit_ft_overhead_trace(void)
{
	ftdev_exit(&cpu_overhead_dev);
	ftdev_exit(&msg_overhead_dev);
}

module_init(init_ft_overhead_trace);
module_exit(exit_ft_overhead_trace);
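/*
 * Example consumer (not built as part of this module): the registered devices
 * are normally read by the accompanying userspace tracing tools, but the raw
 * interface simply yields whole 16-byte struct timestamp records from a
 * character device. The minimal sketch below illustrates this, assuming the
 * ftdev minors appear as /dev/litmus/ft_cpu_trace<N>; the exact device path
 * depends on how device nodes are created and is an assumption here.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		// Hypothetical device node for the CPU 0 buffer.
 *		FILE *f = fopen("/dev/litmus/ft_cpu_trace0", "rb");
 *		unsigned char rec[16];	// sizeof(struct timestamp) == 16
 *		unsigned long n = 0;
 *
 *		if (!f)
 *			return 1;
 *		// Count whole records until fread() stops returning data.
 *		while (fread(rec, sizeof(rec), 1, f) == 1)
 *			n++;
 *		fclose(f);
 *		printf("read %lu timestamp records\n", n);
 *		return 0;
 *	}
 */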