author    Bjoern Brandenburg <bbb@mpi-sws.org>  2015-08-09 07:18:45 -0400
committer Bjoern Brandenburg <bbb@mpi-sws.org>  2017-05-26 17:12:21 -0400
commit    d13b06b1c18565476b0849ecd032aaf6d326b877 (patch)
tree      7b9b606c8c433428587e889f82dade7da89ba721 /litmus/trace.c
parent    d0ec47c1fd203fa87071fda00f8cf0a1826d6a34 (diff)
Feather-Trace: add LITMUS^RT overhead tracing infrastructure
This patch adds the main infrastructure for tracing overheads in LITMUS^RT. It does not yet introduce any tracepoints into the kernel.

Add new overhead tracing event IDs:
- XCALL_{START,END} -- cross-CPU function calls
- SEND_XCALL_{START,END} -- cross-CPU function call latency
- SCHED_TIMER_{START,END} -- any additional, plugin-specific scheduling timer
- TIMER_LATENCY -- latency of a plugin-specific timer

Feather-Trace: add macros for message-passing timestamps

Add timestamp tracing macros that reverse on which cores the timestamps are recorded. This is intended to reduce tracing contention if some cores receive a lot of messages. These macros are useful for plugins based on message passing, such as the (non-mainline) G-EDF-MP.

Feather-Trace: convert timestamps to atomic_t
- patch by Namhoon Kim (UNC)
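As a rough usage illustration (not part of this patch, which introduces no tracepoints yet), a plugin-specific timer handler could later be bracketed with the new SCHED_TIMER events. The TS_SCHED_TIMER_{START,END} wrapper names below are hypothetical and merely assumed to follow the existing TS_* macro convention of litmus/trace.h:

#include <linux/hrtimer.h>
#include <litmus/trace.h>

/* Hypothetical sketch: instrumenting a plugin-specific timer with the new
 * SCHED_TIMER_{START,END} event IDs. TS_SCHED_TIMER_START/END are assumed
 * wrapper names; this patch itself defines only the event IDs. */
static enum hrtimer_restart my_plugin_timer(struct hrtimer *timer)
{
        TS_SCHED_TIMER_START;   /* overhead measurement begins on this CPU */

        /* ... plugin-specific scheduling work ... */

        TS_SCHED_TIMER_END;     /* overhead measurement ends on this CPU */
        return HRTIMER_NORESTART;
}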
Diffstat (limited to 'litmus/trace.c')
-rw-r--r--  litmus/trace.c  562
1 file changed, 562 insertions, 0 deletions
diff --git a/litmus/trace.c b/litmus/trace.c
new file mode 100644
index 000000000000..64d63943f65d
--- /dev/null
+++ b/litmus/trace.c
@@ -0,0 +1,562 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/uaccess.h>

#include <litmus/ftdev.h>
#include <litmus/trace.h>

/* dummy definition of is_realtime() */
#define is_realtime(t) (0)

/******************************************************************************/
/*                                 Allocation                                 */
/******************************************************************************/

static struct ftdev cpu_overhead_dev;
static struct ftdev msg_overhead_dev;

#define cpu_trace_ts_buf(cpu) cpu_overhead_dev.minor[(cpu)].buf
#define msg_trace_ts_buf(cpu) msg_overhead_dev.minor[(cpu)].buf

DEFINE_PER_CPU(atomic_t, irq_fired_count);
DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, cpu_irq_fired_count);

static DEFINE_PER_CPU(unsigned int, cpu_ts_seq_no);
static DEFINE_PER_CPU(atomic_t, msg_ts_seq_no);

static int64_t cycle_offset[NR_CPUS][NR_CPUS];

void ft_irq_fired(void)
{
        /* Only called with preemptions disabled. */
        atomic_inc(this_cpu_ptr(&irq_fired_count));
        atomic_inc(this_cpu_ptr(&cpu_irq_fired_count));
}

static inline void clear_irq_fired(void)
{
        atomic_set(raw_cpu_ptr(&irq_fired_count), 0);
}

static inline unsigned int get_and_clear_irq_fired(void)
{
        /* This is potentially not atomic since we might migrate if
         * preemptions are not disabled. As a tradeoff between
         * accuracy and tracing overheads, this seems acceptable.
         * If it proves to be a problem, then one could add a callback
         * from the migration code to invalidate irq_fired_count.
         */
        return atomic_xchg(raw_cpu_ptr(&irq_fired_count), 0);
}

static inline unsigned int get_and_clear_irq_fired_for_cpu(int cpu)
{
        return atomic_xchg(&per_cpu(irq_fired_count, cpu), 0);
}

static inline void cpu_clear_irq_fired(void)
{
        atomic_set(raw_cpu_ptr(&cpu_irq_fired_count), 0);
}

static inline unsigned int cpu_get_and_clear_irq_fired(void)
{
        return atomic_xchg(raw_cpu_ptr(&cpu_irq_fired_count), 0);
}

static inline void save_irq_flags(struct timestamp *ts, unsigned int irq_count)
{
        /* Store how many interrupts occurred. */
        ts->irq_count = irq_count;
        /* Extra flag because ts->irq_count overflows quickly. */
        ts->irq_flag = irq_count > 0;
}

#define NO_IRQ_COUNT 0
#define LOCAL_IRQ_COUNT 1
#define REMOTE_IRQ_COUNT 2

#define DO_NOT_RECORD_TIMESTAMP 0
#define RECORD_LOCAL_TIMESTAMP 1
#define RECORD_OFFSET_TIMESTAMP 2

static inline void __write_record(
        uint8_t event,
        uint8_t type,
        uint16_t pid_fragment,
        unsigned int irq_count,
        int record_irq,
        int hide_irq,
        uint64_t timestamp,
        int record_timestamp,

        int only_single_writer,
        int is_cpu_timestamp,
        int local_cpu,
        uint8_t other_cpu)
{
        unsigned long flags;
        unsigned int seq_no;
        struct timestamp *ts;
        int cpu;
        struct ft_buffer* buf;

        /* Avoid preemptions while recording the timestamp. This reduces the
         * number of "out of order" timestamps in the stream and makes
         * post-processing easier. */

        local_irq_save(flags);

        if (local_cpu)
                cpu = smp_processor_id();
        else
                cpu = other_cpu;

        /* resolved during function inlining */
        if (is_cpu_timestamp) {
                seq_no = __this_cpu_inc_return(cpu_ts_seq_no);
                buf = cpu_trace_ts_buf(cpu);
        } else {
                seq_no = atomic_fetch_inc(&per_cpu(msg_ts_seq_no, cpu));
                buf = msg_trace_ts_buf(cpu);
        }

        /* If buf is non-NULL here, then the buffer cannot be deallocated until
         * we turn interrupts on again. This is because free_timestamp_buffer()
         * indirectly causes TLB invalidations due to modifications of the
         * kernel address space, namely via vfree() in free_ft_buffer(), which
         * cannot be processed until we turn on interrupts again.
         */

        if (buf &&
            (only_single_writer /* resolved during function inlining */
             ? ft_buffer_start_single_write(buf, (void**) &ts)
             : ft_buffer_start_write(buf, (void**) &ts))) {
                ts->event = event;
                ts->seq_no = seq_no;

                ts->task_type = type;
                ts->pid = pid_fragment;

                ts->cpu = cpu;

                if (record_irq) {
                        if (local_cpu)
                                irq_count = cpu_get_and_clear_irq_fired();
                        else
                                irq_count = get_and_clear_irq_fired_for_cpu(cpu);
                }

                save_irq_flags(ts, irq_count - hide_irq);

                if (record_timestamp)
                        timestamp = ft_timestamp();
                if (record_timestamp == RECORD_OFFSET_TIMESTAMP)
                        timestamp += cycle_offset[smp_processor_id()][cpu];

                ts->timestamp = timestamp;
                ft_buffer_finish_write(buf, ts);
        }

        local_irq_restore(flags);
}


static inline void write_cpu_timestamp(
        uint8_t event,
        uint8_t type,
        uint16_t pid_fragment,
        unsigned int irq_count,
        int record_irq,
        int hide_irq,
        uint64_t timestamp,
        int record_timestamp)
{
        __write_record(event, type,
                       pid_fragment,
                       irq_count, record_irq, hide_irq,
                       timestamp, record_timestamp,
                       1 /* only_single_writer */,
                       1 /* is_cpu_timestamp */,
                       1 /* local_cpu */,
                       0xff /* other_cpu */);
}

static inline void save_msg_timestamp(
        uint8_t event,
        int hide_irq)
{
        struct task_struct *t = current;
        __write_record(event, is_realtime(t) ? TSK_RT : TSK_BE,
                       t->pid,
                       0, LOCAL_IRQ_COUNT, hide_irq,
                       0, RECORD_LOCAL_TIMESTAMP,
                       0 /* only_single_writer */,
                       0 /* is_cpu_timestamp */,
                       1 /* local_cpu */,
                       0xff /* other_cpu */);
}

static inline void save_remote_msg_timestamp(
        uint8_t event,
        uint8_t remote_cpu)
{
        struct task_struct *t = current;
        __write_record(event, is_realtime(t) ? TSK_RT : TSK_BE,
                       t->pid,
                       0, REMOTE_IRQ_COUNT, 0,
                       0, RECORD_OFFSET_TIMESTAMP,
                       0 /* only_single_writer */,
                       0 /* is_cpu_timestamp */,
                       0 /* local_cpu */,
                       remote_cpu);
}

feather_callback void save_cpu_timestamp_def(unsigned long event,
                                             unsigned long type)
{
        write_cpu_timestamp(event, type,
                            current->pid,
                            0, LOCAL_IRQ_COUNT, 0,
                            0, RECORD_LOCAL_TIMESTAMP);
}

feather_callback void save_cpu_timestamp_task(unsigned long event,
                                              unsigned long t_ptr)
{
        struct task_struct *t = (struct task_struct *) t_ptr;
        int rt = is_realtime(t);

        write_cpu_timestamp(event, rt ? TSK_RT : TSK_BE,
                            t->pid,
                            0, LOCAL_IRQ_COUNT, 0,
                            0, RECORD_LOCAL_TIMESTAMP);
}

/* fake timestamp to user-reported time */
feather_callback void save_cpu_timestamp_time(unsigned long event,
                                              unsigned long ptr)
{
        uint64_t* time = (uint64_t*) ptr;

        write_cpu_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
                            current->pid,
                            0, LOCAL_IRQ_COUNT, 0,
                            *time, DO_NOT_RECORD_TIMESTAMP);
}

/* Record user-reported IRQ count */
feather_callback void save_cpu_timestamp_irq(unsigned long event,
                                             unsigned long irq_counter_ptr)
{
        uint64_t* irqs = (uint64_t*) irq_counter_ptr;

        write_cpu_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
                            current->pid,
                            *irqs, NO_IRQ_COUNT, 0,
                            0, RECORD_LOCAL_TIMESTAMP);
}

/* Record to remote trace buffer */
feather_callback void msg_sent_to(unsigned long event, unsigned long to)
{
        save_remote_msg_timestamp(event, to);
}

/* Record to local trace buffer */
feather_callback void msg_sent_local(unsigned long event)
{
        save_msg_timestamp(event, 0);
}

/* Suppresses one IRQ from the irq count. Used by TS_SEND_RESCHED_END, which is
 * called from within an interrupt that is expected. */
feather_callback void msg_received_local(unsigned long event)
{
        save_msg_timestamp(event, 1);
}

/* Record to remote trace buffer */
feather_callback void msg_received_from(unsigned long event, unsigned long from)
{
        save_remote_msg_timestamp(event, from);
}

static void __add_timestamp_user(struct timestamp *pre_recorded)
{
        unsigned long flags;
        unsigned int seq_no;
        struct timestamp *ts;
        struct ft_buffer* buf;
        int cpu;

        local_irq_save(flags);

        cpu = smp_processor_id();
        buf = cpu_trace_ts_buf(cpu);

        seq_no = __this_cpu_inc_return(cpu_ts_seq_no);
        if (buf && ft_buffer_start_single_write(buf, (void**) &ts)) {
                *ts = *pre_recorded;
                ts->seq_no = seq_no;
                ts->cpu = raw_smp_processor_id();
                save_irq_flags(ts, get_and_clear_irq_fired());
                ft_buffer_finish_write(buf, ts);
        }

        local_irq_restore(flags);
}

/******************************************************************************/
/*                             DEVICE FILE DRIVER                             */
/******************************************************************************/

struct calibrate_info {
        atomic_t ready;

        uint64_t cycle_count;
};

static void calibrate_helper(void *_info)
{
        struct calibrate_info *info = _info;
        /* check in with master */
        atomic_inc(&info->ready);

        /* wait for master to signal start */
        while (atomic_read(&info->ready))
                cpu_relax();

        /* report time stamp */
        info->cycle_count = ft_timestamp();

        /* tell master that we are done */
        atomic_inc(&info->ready);
}


static int64_t calibrate_cpu(int cpu)
{
        uint64_t cycles;
        struct calibrate_info info;
        unsigned long flags;
        int64_t delta;

        atomic_set(&info.ready, 0);
        info.cycle_count = 0;
        smp_wmb();

        smp_call_function_single(cpu, calibrate_helper, &info, 0);

        /* wait for helper to become active */
        while (!atomic_read(&info.ready))
                cpu_relax();

        /* avoid interrupt interference */
        local_irq_save(flags);

        /* take measurement */
        atomic_set(&info.ready, 0);
        smp_wmb();
        cycles = ft_timestamp();

        /* wait for helper reading */
        while (!atomic_read(&info.ready))
                cpu_relax();

        /* positive offset: the other guy is ahead of us */
        delta  = (int64_t) info.cycle_count;
        delta -= (int64_t) cycles;

        local_irq_restore(flags);

        return delta;
}

#define NUM_SAMPLES 10

static long calibrate_tsc_offsets(struct ftdev* ftdev, unsigned int idx,
                                  unsigned long uarg)
{
        int cpu, self, i;
        int64_t delta, sample;

        preempt_disable();
        self = smp_processor_id();

        if (uarg)
                printk(KERN_INFO "Feather-Trace: determining TSC offsets for P%d\n", self);

        for_each_online_cpu(cpu)
                if (cpu != self) {
                        delta = calibrate_cpu(cpu);
                        for (i = 1; i < NUM_SAMPLES; i++) {
                                sample = calibrate_cpu(cpu);
                                delta = sample < delta ? sample : delta;
                        }

                        cycle_offset[self][cpu] = delta;

                        if (uarg)
                                printk(KERN_INFO "Feather-Trace: TSC offset for P%d->P%d is %lld cycles.\n",
                                       self, cpu, cycle_offset[self][cpu]);
                }

        preempt_enable();
        return 0;
}

#define NO_TIMESTAMPS (2 << CONFIG_SCHED_OVERHEAD_TRACE_SHIFT)

static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
{
        unsigned int count = NO_TIMESTAMPS;

        /* An overhead-tracing timestamp should be exactly 16 bytes long. */
        BUILD_BUG_ON(sizeof(struct timestamp) != 16);

        while (count && !ftdev->minor[idx].buf) {
                printk("time stamp buffer: trying to allocate %u time stamps for minor=%u.\n", count, idx);
                ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp));
                count /= 2;
        }
        return ftdev->minor[idx].buf ? 0 : -ENOMEM;
}

static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
{
        /* Grab the buffer pointer before clearing it; freeing
         * ftdev->minor[idx].buf after setting it to NULL would free
         * nothing and leak the buffer. */
        struct ft_buffer* buf = ftdev->minor[idx].buf;

        ftdev->minor[idx].buf = NULL;
        /* Make sure all cores have actually seen buf == NULL before
         * yanking out the mappings from underneath them. */
        smp_wmb();
        free_ft_buffer(buf);
}

static ssize_t write_timestamp_from_user(struct ft_buffer* buf, size_t len,
                                         const char __user *from)
{
        ssize_t consumed = 0;
        struct timestamp ts;

        /* don't give us partial timestamps */
        if (len % sizeof(ts))
                return -EINVAL;

        while (len >= sizeof(ts)) {
                if (copy_from_user(&ts, from, sizeof(ts))) {
                        consumed = -EFAULT;
                        goto out;
                }
                len      -= sizeof(ts);
                from     += sizeof(ts);
                consumed += sizeof(ts);

                /* Note: this always adds to the buffer of the CPU-local
                 * device, not necessarily to the device that the system call
                 * was invoked on. This is admittedly a bit ugly, but requiring
                 * tasks to only write to the appropriate device would make
                 * tracing from userspace under global and clustered scheduling
                 * exceedingly difficult. Writing to remote buffers would
                 * require giving up ft_buffer_start_single_write(), which we
                 * want to keep using to reduce the number of atomic ops in the
                 * common case (which is the recording of CPU-local scheduling
                 * overheads).
                 */
                __add_timestamp_user(&ts);
        }

out:
        return consumed;
}

static int __init init_cpu_ft_overhead_trace(void)
{
        int err, cpu;

        printk("Initializing Feather-Trace per-cpu overhead tracing device.\n");
        err = ftdev_init(&cpu_overhead_dev, THIS_MODULE,
                         num_online_cpus(), "ft_cpu_trace");
        if (err)
                goto err_out;

        cpu_overhead_dev.alloc = alloc_timestamp_buffer;
        cpu_overhead_dev.free  = free_timestamp_buffer;
        cpu_overhead_dev.write = write_timestamp_from_user;

        err = register_ftdev(&cpu_overhead_dev);
        if (err)
                goto err_dealloc;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                per_cpu(cpu_ts_seq_no, cpu) = 0;
        }

        return 0;

err_dealloc:
        ftdev_exit(&cpu_overhead_dev);
err_out:
        printk(KERN_WARNING "Could not register per-cpu ft_trace device.\n");
        return err;
}

static int __init init_msg_ft_overhead_trace(void)
{
        int err, cpu;

        printk("Initializing Feather-Trace per-cpu message overhead tracing device.\n");
        err = ftdev_init(&msg_overhead_dev, THIS_MODULE,
                         num_online_cpus(), "ft_msg_trace");
        if (err)
                goto err_out;

        msg_overhead_dev.alloc = alloc_timestamp_buffer;
        msg_overhead_dev.free  = free_timestamp_buffer;
        msg_overhead_dev.calibrate = calibrate_tsc_offsets;

        err = register_ftdev(&msg_overhead_dev);
        if (err)
                goto err_dealloc;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                atomic_set(&per_cpu(msg_ts_seq_no, cpu), 0);
        }

        return 0;

err_dealloc:
        ftdev_exit(&msg_overhead_dev);
err_out:
        printk(KERN_WARNING "Could not register message ft_trace device.\n");
        return err;
}


static int __init init_ft_overhead_trace(void)
{
        int err, i, j;

        for (i = 0; i < NR_CPUS; i++)
                for (j = 0; j < NR_CPUS; j++)
                        cycle_offset[i][j] = 0;

        err = init_cpu_ft_overhead_trace();
        if (err)
                return err;

        err = init_msg_ft_overhead_trace();
        if (err)
                ftdev_exit(&cpu_overhead_dev);
        return err;
}

static void __exit exit_ft_overhead_trace(void)
{
        ftdev_exit(&cpu_overhead_dev);
        ftdev_exit(&msg_overhead_dev);
}

module_init(init_ft_overhead_trace);
module_exit(exit_ft_overhead_trace);
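
For context beyond the patch itself: write_timestamp_from_user() above accepts only whole 16-byte struct timestamp records (anything else returns -EINVAL), so a userspace tool could inject pre-recorded timestamps roughly as in the minimal sketch below. The /dev/litmus/ft_cpu_trace0 device path and the exact field layout are assumptions not shown in this patch; only the record size and the write semantics come from the code above.

/* Hypothetical userspace sketch: injecting one pre-recorded timestamp into
 * the per-CPU overhead trace device registered by this patch. */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        uint8_t record[16];     /* one struct timestamp, exactly 16 bytes */
        int fd;

        fd = open("/dev/litmus/ft_cpu_trace0", O_WRONLY);  /* assumed path */
        if (fd < 0)
                return 1;

        memset(record, 0, sizeof(record));
        /* ... fill in event, pid, and timestamp fields per litmus/trace.h ... */

        /* The write length must be a multiple of 16 bytes, or the driver
         * rejects it with -EINVAL. */
        if (write(fd, record, sizeof(record)) != (ssize_t) sizeof(record))
                return 1;

        return close(fd);
}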