1 files changed, 563 insertions, 0 deletions
diff --git a/kernel/profile.c b/kernel/profile.c
new file mode 100644
index 000000000000..a38fa70075fe
--- /dev/null
+++ b/kernel/profile.c
@@ -0,0 +1,563 @@
+/*
+ *  linux/kernel/profile.c
+ *  Simple profiling. Manages a direct-mapped profile hit count buffer,
+ *  with configurable resolution, support for restricting the cpus on
+ *  which profiling is done, and switching between cpu time and
+ *  schedule() calls via kernel command line parameters passed at boot.
+ *
+ *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
+ *      Red Hat, July 2004
+ *  Consolidation of architecture support code for profiling,
+ *      William Irwin, Oracle, July 2004
+ *  Amortized hit count accounting via per-cpu open-addressed hashtables
+ *      to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/profile.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/cpu.h>
+#include <linux/profile.h>
+#include <linux/highmem.h>
+#include <asm/sections.h>
+#include <asm/semaphore.h>
+/*
+ * One entry of a per-cpu pending-hit hashtable: 'pc' is the prof_buffer
+ * slot index the entry refers to and 'hits' is the number of hits that
+ * have not yet been added to prof_buffer[pc].
+ */
+struct profile_hit {
+        u32 pc, hits;
+};
+/*
+ * Geometry of a pending-hit hashtable: a table holds as many entries as
+ * fit in one page (NR_PROFILE_HIT) and is probed in groups of
+ * PROFILE_GRPSZ consecutive entries, giving NR_PROFILE_GRP groups.
+ */
+#define PROFILE_GRPSHIFT        3
+#define PROFILE_GRPSZ           (1 << PROFILE_GRPSHIFT)
+#define NR_PROFILE_HIT          (PAGE_SIZE/sizeof(struct profile_hit))
+#define NR_PROFILE_GRP          (NR_PROFILE_HIT/PROFILE_GRPSZ)
+/* Oprofile timer tick hook */
+int (*timer_hook)(struct pt_regs *);
+/* hit counters, one per (text offset >> prof_shift); allocated by profile_init() */
+static atomic_t *prof_buffer;
+/* number of counters in prof_buffer, and the address-to-slot shift */
+static unsigned long prof_len, prof_shift;
+/* 0 when profiling is off, otherwise the active profiling type */
+static int prof_on;
+/* cpus on which profile_tick() records hits */
+static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
+#ifdef CONFIG_SMP
+/* the two pending-hit hashtables of each cpu */
+static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
+/* index (0 or 1) of the table the cpu currently records into */
+static DEFINE_PER_CPU(int, cpu_profile_flip);
+/* serializes buffer flips (flush and discard) */
+static DECLARE_MUTEX(profile_flip_mutex);
+#endif /* CONFIG_SMP */
+/*
+ * Parse the "profile=" boot parameter.
+ *
+ * Accepted forms:
+ *   profile=<shift>            cpu time profiling with the given shift
+ *   profile=schedule           schedule() profiling, shift left unchanged
+ *   profile=schedule,<shift>   schedule() profiling with the given shift
+ *
+ * Always returns 1.
+ */
+static int __init profile_setup(char * str)
+{
+        const int schedlen = sizeof("schedule") - 1;
+        int par;
+        if (!strncmp(str, "schedule", schedlen)) {
+                prof_on = SCHED_PROFILING;
+                printk(KERN_INFO "kernel schedule profiling enabled\n");
+                /*
+                 * The separator sits right after the keyword, at index
+                 * schedlen; step over it so that get_option() sees the
+                 * number instead of the ','.
+                 */
+                if (str[schedlen] == ',')
+                        str += schedlen + 1;
+                /* an optional shift must not switch the profiling type */
+                if (get_option(&str, &par))
+                        prof_shift = par;
+        } else if (get_option(&str, &par)) {
+                prof_shift = par;
+                prof_on = CPU_PROFILING;
+                printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
+                        prof_shift);
+        }
+        return 1;
+}
+__setup("profile=", profile_setup);
+/*
+ * Size and allocate the profile buffer if profiling was enabled on the
+ * command line: one atomic_t counter per (1 << prof_shift) bytes of the
+ * range _stext.._etext.
+ */
+void __init profile_init(void)
+{
+        if (prof_on) {
+                /* only text is profiled */
+                prof_len = (_etext - _stext) >> prof_shift;
+                prof_buffer = alloc_bootmem(prof_len * sizeof(atomic_t));
+        }
+}
+/* Profile event notifications */
+ 
+#ifdef CONFIG_PROFILING
+ 
+/*
+ * profile_rwsem guards task_exit_notifier and munmap_notifier: the chains
+ * are called under down_read(), (un)registration takes down_write().
+ */
+static DECLARE_RWSEM(profile_rwsem);
+/* guards task_free_notifier in the same reader/writer fashion */
+static DEFINE_RWLOCK(handoff_lock);
+static struct notifier_block * task_exit_notifier;
+static struct notifier_block * task_free_notifier;
+static struct notifier_block * munmap_notifier;
+ 
+/*
+ * Call the task_exit notifier chain with 'task' as its data argument,
+ * holding profile_rwsem for reading so the chain cannot change meanwhile.
+ */
+void profile_task_exit(struct task_struct * task)
+{
+        down_read(&profile_rwsem);
+        notifier_call_chain(&task_exit_notifier, 0, task);
+        up_read(&profile_rwsem);
+}
+ 
+/*
+ * Run the task_free notifier chain for 'task' under the handoff read lock.
+ * Returns 1 when the chain reported NOTIFY_OK, 0 for any other result.
+ */
+int profile_handoff_task(struct task_struct * task)
+{
+        int status;
+        read_lock(&handoff_lock);
+        status = notifier_call_chain(&task_free_notifier, 0, task);
+        read_unlock(&handoff_lock);
+        if (status == NOTIFY_OK)
+                return 1;
+        return 0;
+}
+/*
+ * Call the munmap notifier chain, passing 'addr' (cast to a pointer) as
+ * its data argument, with profile_rwsem held for reading.
+ */
+void profile_munmap(unsigned long addr)
+{
+        down_read(&profile_rwsem);
+        notifier_call_chain(&munmap_notifier, 0, (void *)addr);
+        up_read(&profile_rwsem);
+}
+/*
+ * Add 'n' to the task_free notifier chain under the handoff write lock.
+ * Returns whatever notifier_chain_register() returns.
+ */
+int task_handoff_register(struct notifier_block * n)
+{
+        int result;
+        write_lock(&handoff_lock);
+        result = notifier_chain_register(&task_free_notifier, n);
+        write_unlock(&handoff_lock);
+        return result;
+}
+/*
+ * Remove 'n' from the task_free notifier chain under the handoff write
+ * lock. Returns whatever notifier_chain_unregister() returns.
+ */
+int task_handoff_unregister(struct notifier_block * n)
+{
+        int result;
+        write_lock(&handoff_lock);
+        result = notifier_chain_unregister(&task_free_notifier, n);
+        write_unlock(&handoff_lock);
+        return result;
+}
+/*
+ * Register 'n' on the notifier chain selected by 'type'
+ * (PROFILE_TASK_EXIT or PROFILE_MUNMAP). Returns the result of
+ * notifier_chain_register(), or -EINVAL for any other type.
+ */
+int profile_event_register(enum profile_type type, struct notifier_block * n)
+{
+        struct notifier_block **chain = NULL;
+        int err = -EINVAL;
+        if (type == PROFILE_TASK_EXIT)
+                chain = &task_exit_notifier;
+        else if (type == PROFILE_MUNMAP)
+                chain = &munmap_notifier;
+        down_write(&profile_rwsem);
+        if (chain)
+                err = notifier_chain_register(chain, n);
+        up_write(&profile_rwsem);
+        return err;
+}
+ 
+/*
+ * Remove 'n' from the notifier chain selected by 'type'
+ * (PROFILE_TASK_EXIT or PROFILE_MUNMAP). Returns the result of
+ * notifier_chain_unregister(), or -EINVAL for any other type.
+ */
+int profile_event_unregister(enum profile_type type, struct notifier_block * n)
+{
+        struct notifier_block **chain = NULL;
+        int err = -EINVAL;
+        if (type == PROFILE_TASK_EXIT)
+                chain = &task_exit_notifier;
+        else if (type == PROFILE_MUNMAP)
+                chain = &munmap_notifier;
+        down_write(&profile_rwsem);
+        if (chain)
+                err = notifier_chain_unregister(chain, n);
+        up_write(&profile_rwsem);
+        return err;
+}
+/*
+ * Install 'hook' as the timer tick hook. Only one hook may be installed
+ * at a time: returns -EBUSY if one is already set, 0 on success.
+ */
+int register_timer_hook(int (*hook)(struct pt_regs *))
+{
+        if (!timer_hook) {
+                timer_hook = hook;
+                return 0;
+        }
+        return -EBUSY;
+}
+/*
+ * Clear the timer tick hook. Warns if 'hook' is not the hook currently
+ * installed, but clears it regardless.
+ */
+void unregister_timer_hook(int (*hook)(struct pt_regs *))
+{
+        WARN_ON(hook != timer_hook);
+        timer_hook = NULL;
+        /* make sure all CPUs see the NULL hook */
+        synchronize_kernel();
+}
+EXPORT_SYMBOL_GPL(register_timer_hook);
+EXPORT_SYMBOL_GPL(unregister_timer_hook);
+EXPORT_SYMBOL_GPL(task_handoff_register);
+EXPORT_SYMBOL_GPL(task_handoff_unregister);
+#endif /* CONFIG_PROFILING */
+EXPORT_SYMBOL_GPL(profile_event_register);
+EXPORT_SYMBOL_GPL(profile_event_unregister);
+#ifdef CONFIG_SMP
+/*
+ * Each cpu has a pair of open-addressed hashtables for pending
+ * profile hits. read_profile() IPI's all cpus to request them
+ * to flip buffers and flushes their contents to prof_buffer itself.
+ * Flip requests are serialized by the profile_flip_mutex. The sole
+ * use of having a second hashtable is for avoiding cacheline
+ * contention that would otherwise happen during flushes of pending
+ * profile hits required for the accuracy of reported profile hits
+ * and so resurrect the interrupt livelock issue.
+ *
+ * The open-addressed hashtables are indexed by profile buffer slot
+ * and hold the number of pending hits to that profile buffer slot on
+ * a cpu in an entry. When the hashtable overflows, all pending hits
+ * are accounted to their corresponding profile buffer slots with
+ * atomic_add() and the hashtable emptied. As numerous pending hits
+ * may be accounted to a profile buffer slot in a hashtable entry,
+ * this amortizes a number of atomic profile buffer increments likely
+ * to be far larger than the number of entries in the hashtable,
+ * particularly given that the number of distinct profile buffer
+ * positions to which hits are accounted during short intervals (e.g.
+ * several seconds) is usually very small. Exclusion from buffer
+ * flipping is provided by interrupt disablement (note that for
+ * SCHED_PROFILING profile_hit() may be called from process context).
+ * The hash function is meant to be lightweight as opposed to strong,
+ * and was vaguely inspired by ppc64 firmware-supported inverted
+ * pagetable hash functions, but uses a full hashtable full of finite
+ * collision chains, not just pairs of them.
+ *
+ * -- wli
+ */
+/*
+ * Toggle which of the two pending-hit tables the executing cpu records
+ * into. Run on every cpu via on_each_cpu(); the argument is ignored.
+ */
+static void __profile_flip_buffers(void *unused)
+{
+        int this_cpu = smp_processor_id();
+        int current_table = per_cpu(cpu_profile_flip, this_cpu);
+        per_cpu(cpu_profile_flip, this_cpu) = !current_table;
+}
+/*
+ * Switch every cpu to its other pending-hit table, then add the contents
+ * of the tables that were in use until now to prof_buffer and clear them.
+ * Serialized against other flips by profile_flip_mutex.
+ *
+ * NOTE(review): the index of the table to drain is sampled from the
+ * calling cpu only, so this relies on all online cpus sharing the same
+ * cpu_profile_flip value -- confirm this holds across cpu hotplug, where
+ * CPU_UP_PREPARE resets the new cpu's value to 0.
+ */
+static void profile_flip_buffers(void)
+{
+        int i, j, cpu;
+        down(&profile_flip_mutex);
+        /* j: the table in use before the flip, i.e. the one drained below */
+        j = per_cpu(cpu_profile_flip, get_cpu());
+        put_cpu();
+        on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
+        for_each_online_cpu(cpu) {
+                struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
+                for (i = 0; i < NR_PROFILE_HIT; ++i) {
+                        if (!hits[i].hits) {
+                                /* nothing pending: only clear a leftover slot index */
+                                if (hits[i].pc)
+                                        hits[i].pc = 0;
+                                continue;
+                        }
+                        atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
+                        hits[i].hits = hits[i].pc = 0;
+                }
+        }
+        up(&profile_flip_mutex);
+}
+/*
+ * Switch every cpu to its other pending-hit table and zero the tables
+ * that were in use until now, dropping their pending hits instead of
+ * adding them to prof_buffer.
+ */
+static void profile_discard_flip_buffers(void)
+{
+        int i, cpu;
+        down(&profile_flip_mutex);
+        /* i: the table in use before the flip, i.e. the one wiped below */
+        i = per_cpu(cpu_profile_flip, get_cpu());
+        put_cpu();
+        on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
+        for_each_online_cpu(cpu) {
+                struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
+                memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
+        }
+        up(&profile_flip_mutex);
+}
+/*
+ * SMP version: record one hit for text address __pc in the executing
+ * cpu's active pending-hit table. Does nothing unless 'type' is the
+ * active profiling type and prof_buffer has been allocated.
+ */
+void profile_hit(int type, void *__pc)
+{
+        unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
+        int i, j, cpu;
+        struct profile_hit *hits;
+        if (prof_on != type || !prof_buffer)
+                return;
+        /* from here on 'pc' is a prof_buffer slot index, clamped to the last slot */
+        pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
+        /* primary: first entry of the first group probed; secondary: probe stride */
+        i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
+        secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
+        cpu = get_cpu();
+        hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
+        /* no table for this cpu: the hit is dropped */
+        if (!hits) {
+                put_cpu();
+                return;
+        }
+        local_irq_save(flags);
+        do {
+                for (j = 0; j < PROFILE_GRPSZ; ++j) {
+                        if (hits[i + j].pc == pc) {
+                                /* slot already has an entry: bump its count */
+                                hits[i + j].hits++;
+                                goto out;
+                        } else if (!hits[i + j].hits) {
+                                /* entry with no pending hits: claim it for this slot */
+                                hits[i + j].pc = pc;
+                                hits[i + j].hits = 1;
+                                goto out;
+                        }
+                }
+                i = (i + secondary) & (NR_PROFILE_HIT - 1);
+        } while (i != primary);
+        /*
+         * Probing came back to the first group without finding room:
+         * account this hit directly, then flush and empty the whole table.
+         */
+        atomic_inc(&prof_buffer[pc]);
+        for (i = 0; i < NR_PROFILE_HIT; ++i) {
+                atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
+                hits[i].pc = hits[i].hits = 0;
+        }
+out:
+        local_irq_restore(flags);
+        put_cpu();
+}
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * CPU hotplug notifier: manages the two pending-hit table pages of the
+ * cpu given in __cpu and that cpu's bit in prof_cpu_mask.
+ *
+ * Returns NOTIFY_BAD if a table page cannot be allocated during
+ * CPU_UP_PREPARE, NOTIFY_OK otherwise (including for unhandled actions).
+ */
+static int __devinit profile_cpu_callback(struct notifier_block *info,
+                                        unsigned long action, void *__cpu)
+{
+        int node, cpu = (unsigned long)__cpu;
+        struct page *page;
+        switch (action) {
+        case CPU_UP_PREPARE:
+                node = cpu_to_node(cpu);
+                per_cpu(cpu_profile_flip, cpu) = 0;
+                /* allocate whichever of the two tables is still missing */
+                if (!per_cpu(cpu_profile_hits, cpu)[1]) {
+                        page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+                        if (!page)
+                                return NOTIFY_BAD;
+                        per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
+                }
+                if (!per_cpu(cpu_profile_hits, cpu)[0]) {
+                        page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+                        if (!page)
+                                goto out_free;
+                        per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
+                }
+                break;
+        /* only reached by the goto above: table 0 failed, so release table 1 */
+        out_free:
+                page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+                per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+                __free_page(page);
+                return NOTIFY_BAD;
+        case CPU_ONLINE:
+                cpu_set(cpu, prof_cpu_mask);
+                break;
+        case CPU_UP_CANCELED:
+        case CPU_DEAD:
+                /* stop profiling the cpu and release both of its tables */
+                cpu_clear(cpu, prof_cpu_mask);
+                if (per_cpu(cpu_profile_hits, cpu)[0]) {
+                        page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
+                        per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+                        __free_page(page);
+                }
+                if (per_cpu(cpu_profile_hits, cpu)[1]) {
+                        page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+                        per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+                        __free_page(page);
+                }
+                break;
+        }
+        return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+#else /* !CONFIG_SMP */
+/* uniprocessor: hits go straight into prof_buffer, so there is nothing to flip */
+#define profile_flip_buffers()          do { } while (0)
+#define profile_discard_flip_buffers()  do { } while (0)
+/*
+ * Uniprocessor version: increment the prof_buffer counter for text
+ * address __pc, clamping to the last slot. Does nothing unless 'type'
+ * is the active profiling type and prof_buffer has been allocated.
+ */
+void profile_hit(int type, void *__pc)
+{
+        unsigned long slot;
+        if (prof_on == type && prof_buffer) {
+                slot = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
+                atomic_inc(&prof_buffer[min(slot, prof_len - 1)]);
+        }
+}
+#endif /* !CONFIG_SMP */
+/*
+ * Per-tick entry point. For CPU_PROFILING the installed timer hook, if
+ * any, is called first. A hit is then recorded at profile_pc(regs) when
+ * the tick did not interrupt user mode and the executing cpu is set in
+ * prof_cpu_mask.
+ */
+void profile_tick(int type, struct pt_regs *regs)
+{
+        if (type == CPU_PROFILING && timer_hook)
+                timer_hook(regs);
+        if (user_mode(regs))
+                return;
+        if (!cpu_isset(smp_processor_id(), prof_cpu_mask))
+                return;
+        profile_hit(type, (void *)profile_pc(regs));
+}
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include <asm/ptrace.h>
+/*
+ * read_proc handler: format the cpumask that 'data' points to into 'page'
+ * followed by a newline. Returns the number of bytes written, or -EINVAL
+ * if fewer than two bytes of 'count' remain after the mask.
+ */
+static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
+                        int count, int *eof, void *data)
+{
+        cpumask_t *mask = data;
+        int used = cpumask_scnprintf(page, count, *mask);
+        if (count - used >= 2) {
+                used += sprintf(page + used, "\n");
+                return used;
+        }
+        return -EINVAL;
+}
+/*
+ * write_proc handler: parse a cpumask from the user buffer and, if it
+ * parses, store it in the cpumask that 'data' points to.
+ * Returns 'count' on success or the error reported by cpumask_parse().
+ */
+static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
+                                        unsigned long count, void *data)
+{
+        cpumask_t *mask = (cpumask_t *)data;
+        cpumask_t new_value;
+        /* signed, so a negative errno is carried to the int return unchanged */
+        int err;
+        err = cpumask_parse(buffer, count, new_value);
+        if (err)
+                return err;
+        *mask = new_value;
+        return count;
+}
+/*
+ * Create the "prof_cpu_mask" proc entry (mode 0600) under root_irq_dir
+ * and wire it to the read/write handlers for prof_cpu_mask. Silently
+ * does nothing if the entry cannot be created.
+ */
+void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
+{
+        struct proc_dir_entry *ent;
+        /* create /proc/irq/prof_cpu_mask */
+        ent = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
+        if (ent) {
+                ent->nlink = 1;
+                ent->data = (void *)&prof_cpu_mask;
+                ent->read_proc = prof_cpu_mask_read_proc;
+                ent->write_proc = prof_cpu_mask_write_proc;
+        }
+}
+/*
+ * This function accesses profiling information. The returned data is
+ * binary: the sampling step and the actual contents of the profile
+ * buffer. Use of the program readprofile is recommended in order to
+ * get meaningful info out of these data.
+ */
+/*
+ * read() handler for the profile file. The file consists of one unsigned
+ * int holding the sampling step (1 << prof_shift) followed by the
+ * contents of prof_buffer. Pending per-cpu hits are flushed first.
+ *
+ * Returns the number of bytes copied, 0 at or past end of file, or
+ * -EFAULT if the user buffer cannot be written.
+ */
+static ssize_t
+read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+        unsigned long p = *ppos;
+        ssize_t read;
+        char * pnt;
+        unsigned int sample_step = 1 << prof_shift;
+        profile_flip_buffers();
+        if (p >= (prof_len+1)*sizeof(unsigned int))
+                return 0;
+        if (count > (prof_len+1)*sizeof(unsigned int) - p)
+                count = (prof_len+1)*sizeof(unsigned int) - p;
+        read = 0;
+        /* header: the bytes of sample_step, copied one at a time */
+        while (p < sizeof(unsigned int) && count > 0) {
+                /* a faulting user buffer must fail the read, not be ignored */
+                if (put_user(*((char *)(&sample_step)+p),buf))
+                        return -EFAULT;
+                buf++; p++; count--; read++;
+        }
+        /* file offset p maps to prof_buffer offset p minus the header size */
+        pnt = (char *)prof_buffer + p - sizeof(atomic_t);
+        if (copy_to_user(buf,(void *)pnt,count))
+                return -EFAULT;
+        read += count;
+        *ppos += read;
+        return read;
+}
+/*
+ * Writing to /proc/profile resets the counters
+ *
+ * Writing a 'profiling multiplier' value into it also re-sets the profiling
+ * interrupt frequency, on architectures that support this.
+ */
+/*
+ * write() handler for the profile file: discards pending per-cpu hits and
+ * zeroes prof_buffer. On SMP, a write of exactly sizeof(int) bytes is
+ * first passed to setup_profiling_timer() as a multiplier.
+ *
+ * Returns 'count', -EFAULT if the multiplier cannot be read from user
+ * space, or -EINVAL if setup_profiling_timer() reports failure; in both
+ * error cases the counters are left untouched.
+ */
+static ssize_t write_profile(struct file *file, const char __user *buf,
+                             size_t count, loff_t *ppos)
+{
+#ifdef CONFIG_SMP
+        extern int setup_profiling_timer (unsigned int multiplier);
+        if (count == sizeof(int)) {
+                unsigned int multiplier;
+                if (copy_from_user(&multiplier, buf, sizeof(int)))
+                        return -EFAULT;
+                if (setup_profiling_timer(multiplier))
+                        return -EINVAL;
+        }
+#endif
+        profile_discard_flip_buffers();
+        memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
+        return count;
+}
+/* file operations installed on the "profile" proc entry by create_proc_profile() */
+static struct file_operations proc_profile_operations = {
+        .read           = read_profile,
+        .write          = write_profile,
+};
+#ifdef CONFIG_SMP
+/*
+ * Deliberately empty: create_hash_tables() runs this on every cpu after
+ * clearing prof_on and before freeing the tables.
+ * NOTE(review): presumably a barrier so no cpu is still inside
+ * profile_hit() when the pages are freed -- confirm.
+ */
+static void __init profile_nop(void *unused)
+{
+}
+/*
+ * Allocate the two zeroed pending-hit table pages for every online cpu,
+ * each on that cpu's node.
+ *
+ * Returns 0 on success. If any allocation fails, profiling is switched
+ * off (prof_on = 0), every table allocated so far is freed and -1 is
+ * returned.
+ */
+static int __init create_hash_tables(void)
+{
+        int cpu;
+        for_each_online_cpu(cpu) {
+                int node = cpu_to_node(cpu);
+                struct page *page;
+                page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+                if (!page)
+                        goto out_cleanup;
+                per_cpu(cpu_profile_hits, cpu)[1]
+                                = (struct profile_hit *)page_address(page);
+                page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+                if (!page)
+                        goto out_cleanup;
+                per_cpu(cpu_profile_hits, cpu)[0]
+                                = (struct profile_hit *)page_address(page);
+        }
+        return 0;
+out_cleanup:
+        /* disable profiling before any table is taken away */
+        prof_on = 0;
+        mb();
+        on_each_cpu(profile_nop, NULL, 0, 1);
+        for_each_online_cpu(cpu) {
+                struct page *page;
+                if (per_cpu(cpu_profile_hits, cpu)[0]) {
+                        page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
+                        per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+                        __free_page(page);
+                }
+                if (per_cpu(cpu_profile_hits, cpu)[1]) {
+                        page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+                        per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+                        __free_page(page);
+                }
+        }
+        return -1;
+}
+#else
+/* uniprocessor: there are no per-cpu tables to create, always succeeds with 0 */
+#define create_hash_tables()                    ({ 0; })
+#endif
+/*
+ * Init-time setup of the "profile" proc file. Does nothing when profiling
+ * is off. Returns -1 if the per-cpu hash tables cannot be created and 0
+ * otherwise, including when the proc entry itself cannot be created.
+ */
+static int __init create_proc_profile(void)
+{
+        struct proc_dir_entry *ent;
+        if (!prof_on)
+                return 0;
+        if (create_hash_tables())
+                return -1;
+        ent = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
+        if (ent) {
+                ent->proc_fops = &proc_profile_operations;
+                /* one leading word for the sampling step plus the counters */
+                ent->size = (1+prof_len) * sizeof(atomic_t);
+                hotcpu_notifier(profile_cpu_callback, 0);
+        }
+        return 0;
+}
+module_init(create_proc_profile);
+#endif /* CONFIG_PROC_FS */

diff --git a/kernel/profile.c b/kernel/profile.c new file mode 100644 index 000000000000..a38fa70075fe --- /dev/null +++ b/kernel/profile.c
@@ -0,0 +1,563 @@
	1	/*
	2	* linux/kernel/profile.c
	3	* Simple profiling. Manages a direct-mapped profile hit count buffer,
	4	* with configurable resolution, support for restricting the cpus on
	5	* which profiling is done, and switching between cpu time and
	6	* schedule() calls via kernel command line parameters passed at boot.
	7	*
	8	* Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
	9	* Red Hat, July 2004
	10	* Consolidation of architecture support code for profiling,
	11	* William Irwin, Oracle, July 2004
	12	* Amortized hit count accounting via per-cpu open-addressed hashtables
	13	* to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
	14	*/
	15
	16	#include <linux/config.h>
	17	#include <linux/module.h>
	18	#include <linux/profile.h>
	19	#include <linux/bootmem.h>
	20	#include <linux/notifier.h>
	21	#include <linux/mm.h>
	22	#include <linux/cpumask.h>
	23	#include <linux/cpu.h>
	24	#include <linux/profile.h>
	25	#include <linux/highmem.h>
	26	#include <asm/sections.h>
	27	#include <asm/semaphore.h>
	28
	29	struct profile_hit {
	30	u32 pc, hits;
	31	};
	32	#define PROFILE_GRPSHIFT 3
	33	#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
	34	#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
	35	#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
	36
	37	/* Oprofile timer tick hook */
	38	int (*timer_hook)(struct pt_regs *);
	39
	40	static atomic_t *prof_buffer;
	41	static unsigned long prof_len, prof_shift;
	42	static int prof_on;
	43	static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
	44	#ifdef CONFIG_SMP
	45	static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
	46	static DEFINE_PER_CPU(int, cpu_profile_flip);
	47	static DECLARE_MUTEX(profile_flip_mutex);
	48	#endif /* CONFIG_SMP */
	49
	50	static int __init profile_setup(char * str)
	51	{
	52	int par;
	53
	54	if (!strncmp(str, "schedule", 8)) {
	55	prof_on = SCHED_PROFILING;
	56	printk(KERN_INFO "kernel schedule profiling enabled\n");
	57	if (str[7] == ',')
	58	str += 8;
	59	}
	60	if (get_option(&str,&par)) {
	61	prof_shift = par;
	62	prof_on = CPU_PROFILING;
	63	printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
	64	prof_shift);
	65	}
	66	return 1;
	67	}
	68	__setup("profile=", profile_setup);
	69
	70
	71	void __init profile_init(void)
	72	{
	73	if (!prof_on)
	74	return;
	75
	76	/* only text is profiled */
	77	prof_len = (_etext - _stext) >> prof_shift;
	78	prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
	79	}
	80
	81	/* Profile event notifications */
	82
	83	#ifdef CONFIG_PROFILING
	84
	85	static DECLARE_RWSEM(profile_rwsem);
	86	static DEFINE_RWLOCK(handoff_lock);
	87	static struct notifier_block * task_exit_notifier;
	88	static struct notifier_block * task_free_notifier;
	89	static struct notifier_block * munmap_notifier;
	90
	91	void profile_task_exit(struct task_struct * task)
	92	{
	93	down_read(&profile_rwsem);
	94	notifier_call_chain(&task_exit_notifier, 0, task);
	95	up_read(&profile_rwsem);
	96	}
	97
	98	int profile_handoff_task(struct task_struct * task)
	99	{
	100	int ret;
	101	read_lock(&handoff_lock);
	102	ret = notifier_call_chain(&task_free_notifier, 0, task);
	103	read_unlock(&handoff_lock);
	104	return (ret == NOTIFY_OK) ? 1 : 0;
	105	}
	106
	107	void profile_munmap(unsigned long addr)
	108	{
	109	down_read(&profile_rwsem);
	110	notifier_call_chain(&munmap_notifier, 0, (void *)addr);
	111	up_read(&profile_rwsem);
	112	}
	113
	114	int task_handoff_register(struct notifier_block * n)
	115	{
	116	int err = -EINVAL;
	117
	118	write_lock(&handoff_lock);
	119	err = notifier_chain_register(&task_free_notifier, n);
	120	write_unlock(&handoff_lock);
	121	return err;
	122	}
	123
	124	int task_handoff_unregister(struct notifier_block * n)
	125	{
	126	int err = -EINVAL;
	127
	128	write_lock(&handoff_lock);
	129	err = notifier_chain_unregister(&task_free_notifier, n);
	130	write_unlock(&handoff_lock);
	131	return err;
	132	}
	133
	134	int profile_event_register(enum profile_type type, struct notifier_block * n)
	135	{
	136	int err = -EINVAL;
	137
	138	down_write(&profile_rwsem);
	139
	140	switch (type) {
	141	case PROFILE_TASK_EXIT:
	142	err = notifier_chain_register(&task_exit_notifier, n);
	143	break;
	144	case PROFILE_MUNMAP:
	145	err = notifier_chain_register(&munmap_notifier, n);
	146	break;
	147	}
	148
	149	up_write(&profile_rwsem);
	150
	151	return err;
	152	}
	153
	154
	155	int profile_event_unregister(enum profile_type type, struct notifier_block * n)
	156	{
	157	int err = -EINVAL;
	158
	159	down_write(&profile_rwsem);
	160
	161	switch (type) {
	162	case PROFILE_TASK_EXIT:
	163	err = notifier_chain_unregister(&task_exit_notifier, n);
	164	break;
	165	case PROFILE_MUNMAP:
	166	err = notifier_chain_unregister(&munmap_notifier, n);
	167	break;
	168	}
	169
	170	up_write(&profile_rwsem);
	171	return err;
	172	}
	173
	174	int register_timer_hook(int (*hook)(struct pt_regs *))
	175	{
	176	if (timer_hook)
	177	return -EBUSY;
	178	timer_hook = hook;
	179	return 0;
	180	}
	181
	182	void unregister_timer_hook(int (*hook)(struct pt_regs *))
	183	{
	184	WARN_ON(hook != timer_hook);
	185	timer_hook = NULL;
	186	/* make sure all CPUs see the NULL hook */
	187	synchronize_kernel();
	188	}
	189
	190	EXPORT_SYMBOL_GPL(register_timer_hook);
	191	EXPORT_SYMBOL_GPL(unregister_timer_hook);
	192	EXPORT_SYMBOL_GPL(task_handoff_register);
	193	EXPORT_SYMBOL_GPL(task_handoff_unregister);
	194
	195	#endif /* CONFIG_PROFILING */
	196
	197	EXPORT_SYMBOL_GPL(profile_event_register);
	198	EXPORT_SYMBOL_GPL(profile_event_unregister);
	199
	200	#ifdef CONFIG_SMP
	201	/*
	202	* Each cpu has a pair of open-addressed hashtables for pending
	203	* profile hits. read_profile() IPI's all cpus to request them
	204	* to flip buffers and flushes their contents to prof_buffer itself.
	205	* Flip requests are serialized by the profile_flip_mutex. The sole
	206	* use of having a second hashtable is for avoiding cacheline
	207	* contention that would otherwise happen during flushes of pending
	208	* profile hits required for the accuracy of reported profile hits
	209	* and so resurrect the interrupt livelock issue.
	210	*
	211	* The open-addressed hashtables are indexed by profile buffer slot
	212	* and hold the number of pending hits to that profile buffer slot on
	213	* a cpu in an entry. When the hashtable overflows, all pending hits
	214	* are accounted to their corresponding profile buffer slots with
	215	* atomic_add() and the hashtable emptied. As numerous pending hits
	216	* may be accounted to a profile buffer slot in a hashtable entry,
	217	* this amortizes a number of atomic profile buffer increments likely
	218	* to be far larger than the number of entries in the hashtable,
	219	* particularly given that the number of distinct profile buffer
	220	* positions to which hits are accounted during short intervals (e.g.
	221	* several seconds) is usually very small. Exclusion from buffer
	222	* flipping is provided by interrupt disablement (note that for
	223	* SCHED_PROFILING profile_hit() may be called from process context).
	224	* The hash function is meant to be lightweight as opposed to strong,
	225	* and was vaguely inspired by ppc64 firmware-supported inverted
	226	* pagetable hash functions, but uses a full hashtable full of finite
	227	* collision chains, not just pairs of them.
	228	*
	229	* -- wli
	230	*/
	231	static void __profile_flip_buffers(void *unused)
	232	{
	233	int cpu = smp_processor_id();
	234
	235	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
	236	}
	237
	238	static void profile_flip_buffers(void)
	239	{
	240	int i, j, cpu;
	241
	242	down(&profile_flip_mutex);
	243	j = per_cpu(cpu_profile_flip, get_cpu());
	244	put_cpu();
	245	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
	246	for_each_online_cpu(cpu) {
	247	struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
	248	for (i = 0; i < NR_PROFILE_HIT; ++i) {
	249	if (!hits[i].hits) {
	250	if (hits[i].pc)
	251	hits[i].pc = 0;
	252	continue;
	253	}
	254	atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
	255	hits[i].hits = hits[i].pc = 0;
	256	}
	257	}
	258	up(&profile_flip_mutex);
	259	}
	260
	261	static void profile_discard_flip_buffers(void)
	262	{
	263	int i, cpu;
	264
	265	down(&profile_flip_mutex);
	266	i = per_cpu(cpu_profile_flip, get_cpu());
	267	put_cpu();
	268	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
	269	for_each_online_cpu(cpu) {
	270	struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
	271	memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
	272	}
	273	up(&profile_flip_mutex);
	274	}
	275
	276	void profile_hit(int type, void *__pc)
	277	{
	278	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	279	int i, j, cpu;
	280	struct profile_hit *hits;
	281
	282	if (prof_on != type || !prof_buffer)
	283	return;
	284	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	285	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	286	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	287	cpu = get_cpu();
	288	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	289	if (!hits) {
	290	put_cpu();
	291	return;
	292	}
	293	local_irq_save(flags);
	294	do {
	295	for (j = 0; j < PROFILE_GRPSZ; ++j) {
	296	if (hits[i + j].pc == pc) {
	297	hits[i + j].hits++;
	298	goto out;
	299	} else if (!hits[i + j].hits) {
	300	hits[i + j].pc = pc;
	301	hits[i + j].hits = 1;
	302	goto out;
	303	}
	304	}
	305	i = (i + secondary) & (NR_PROFILE_HIT - 1);
	306	} while (i != primary);
	307	atomic_inc(&prof_buffer[pc]);
	308	for (i = 0; i < NR_PROFILE_HIT; ++i) {
	309	atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
	310	hits[i].pc = hits[i].hits = 0;
	311	}
	312	out:
	313	local_irq_restore(flags);
	314	put_cpu();
	315	}
	316
	317	#ifdef CONFIG_HOTPLUG_CPU
	318	static int __devinit profile_cpu_callback(struct notifier_block *info,
	319	unsigned long action, void *__cpu)
	320	{
	321	int node, cpu = (unsigned long)__cpu;
	322	struct page *page;
	323
	324	switch (action) {
	325	case CPU_UP_PREPARE:
	326	node = cpu_to_node(cpu);
	327	per_cpu(cpu_profile_flip, cpu) = 0;
	328	if (!per_cpu(cpu_profile_hits, cpu)[1]) {
	329	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
	330	if (!page)
	331	return NOTIFY_BAD;
	332	per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
	333	}
	334	if (!per_cpu(cpu_profile_hits, cpu)[0]) {
	335	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
	336	if (!page)
	337	goto out_free;
	338	per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
	339	}
	340	break;
	341	out_free:
	342	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
	343	per_cpu(cpu_profile_hits, cpu)[1] = NULL;
	344	__free_page(page);
	345	return NOTIFY_BAD;
	346	case CPU_ONLINE:
	347	cpu_set(cpu, prof_cpu_mask);
	348	break;
	349	case CPU_UP_CANCELED:
	350	case CPU_DEAD:
	351	cpu_clear(cpu, prof_cpu_mask);
	352	if (per_cpu(cpu_profile_hits, cpu)[0]) {
	353	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
	354	per_cpu(cpu_profile_hits, cpu)[0] = NULL;
	355	__free_page(page);
	356	}
	357	if (per_cpu(cpu_profile_hits, cpu)[1]) {
	358	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
	359	per_cpu(cpu_profile_hits, cpu)[1] = NULL;
	360	__free_page(page);
	361	}
	362	break;
	363	}
	364	return NOTIFY_OK;
	365	}
	366	#endif /* CONFIG_HOTPLUG_CPU */
	367	#else /* !CONFIG_SMP */
	368	#define profile_flip_buffers() do { } while (0)
	369	#define profile_discard_flip_buffers() do { } while (0)
	370
	371	void profile_hit(int type, void *__pc)
	372	{
	373	unsigned long pc;
	374
	375	if (prof_on != type || !prof_buffer)
	376	return;
	377	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
	378	atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);
	379	}
	380	#endif /* !CONFIG_SMP */
	381
	382	void profile_tick(int type, struct pt_regs *regs)
	383	{
	384	if (type == CPU_PROFILING && timer_hook)
	385	timer_hook(regs);
	386	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
	387	profile_hit(type, (void *)profile_pc(regs));
	388	}
	389
	390	#ifdef CONFIG_PROC_FS
	391	#include <linux/proc_fs.h>
	392	#include <asm/uaccess.h>
	393	#include <asm/ptrace.h>
	394
	395	static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
	396	int count, int *eof, void *data)
	397	{
	398	int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
	399	if (count - len < 2)
	400	return -EINVAL;
	401	len += sprintf(page + len, "\n");
	402	return len;
	403	}
	404
	405	static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
	406	unsigned long count, void *data)
	407	{
	408	cpumask_t *mask = (cpumask_t *)data;
	409	unsigned long full_count = count, err;
	410	cpumask_t new_value;
	411
	412	err = cpumask_parse(buffer, count, new_value);
	413	if (err)
	414	return err;
	415
	416	*mask = new_value;
	417	return full_count;
	418	}
	419
	420	void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
	421	{
	422	struct proc_dir_entry *entry;
	423
	424	/* create /proc/irq/prof_cpu_mask */
	425	if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
	426	return;
	427	entry->nlink = 1;
	428	entry->data = (void *)&prof_cpu_mask;
	429	entry->read_proc = prof_cpu_mask_read_proc;
	430	entry->write_proc = prof_cpu_mask_write_proc;
	431	}
	432
	433	/*
	434	* This function accesses profiling information. The returned data is
	435	* binary: the sampling step and the actual contents of the profile
	436	* buffer. Use of the program readprofile is recommended in order to
	437	* get meaningful info out of these data.
	438	*/
	439	static ssize_t
	440	read_profile(struct file file, char __user buf, size_t count, loff_t *ppos)
	441	{
	442	unsigned long p = *ppos;
	443	ssize_t read;
	444	char * pnt;
	445	unsigned int sample_step = 1 << prof_shift;
	446
	447	profile_flip_buffers();
	448	if (p >= (prof_len+1)*sizeof(unsigned int))
	449	return 0;
	450	if (count > (prof_len+1)*sizeof(unsigned int) - p)
	451	count = (prof_len+1)*sizeof(unsigned int) - p;
	452	read = 0;
	453
	454	while (p < sizeof(unsigned int) && count > 0) {
	455	put_user(((char )(&sample_step)+p),buf);
	456	buf++; p++; count--; read++;
	457	}
	458	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
	459	if (copy_to_user(buf,(void *)pnt,count))
	460	return -EFAULT;
	461	read += count;
	462	*ppos += read;
	463	return read;
	464	}
	465
	466	/*
	467	* Writing to /proc/profile resets the counters
	468	*
	469	* Writing a 'profiling multiplier' value into it also re-sets the profiling
	470	* interrupt frequency, on architectures that support this.
	471	*/
	472	static ssize_t write_profile(struct file file, const char __user buf,
	473	size_t count, loff_t *ppos)
	474	{
	475	#ifdef CONFIG_SMP
	476	extern int setup_profiling_timer (unsigned int multiplier);
	477
	478	if (count == sizeof(int)) {
	479	unsigned int multiplier;
	480
	481	if (copy_from_user(&multiplier, buf, sizeof(int)))
	482	return -EFAULT;
	483
	484	if (setup_profiling_timer(multiplier))
	485	return -EINVAL;
	486	}
	487	#endif
	488	profile_discard_flip_buffers();
	489	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
	490	return count;
	491	}
	492
	493	static struct file_operations proc_profile_operations = {
	494	.read = read_profile,
	495	.write = write_profile,
	496	};
	497
	498	#ifdef CONFIG_SMP
	499	static void __init profile_nop(void *unused)
	500	{
	501	}
	502
	503	static int __init create_hash_tables(void)
	504	{
	505	int cpu;
	506
	507	for_each_online_cpu(cpu) {
	508	int node = cpu_to_node(cpu);
	509	struct page *page;
	510
	511	page = alloc_pages_node(node, GFP_KERNEL \| __GFP_ZERO, 0);
	512	if (!page)
	513	goto out_cleanup;
	514	per_cpu(cpu_profile_hits, cpu)[1]
	515	= (struct profile_hit *)page_address(page);
	516	page = alloc_pages_node(node, GFP_KERNEL \| __GFP_ZERO, 0);
	517	if (!page)
	518	goto out_cleanup;
	519	per_cpu(cpu_profile_hits, cpu)[0]
	520	= (struct profile_hit *)page_address(page);
	521	}
	522	return 0;
	523	out_cleanup:
	524	prof_on = 0;
	525	mb();
	526	on_each_cpu(profile_nop, NULL, 0, 1);
	527	for_each_online_cpu(cpu) {
	528	struct page *page;
	529
	530	if (per_cpu(cpu_profile_hits, cpu)[0]) {
	531	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
	532	per_cpu(cpu_profile_hits, cpu)[0] = NULL;
	533	__free_page(page);
	534	}
	535	if (per_cpu(cpu_profile_hits, cpu)[1]) {
	536	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
	537	per_cpu(cpu_profile_hits, cpu)[1] = NULL;
	538	__free_page(page);
	539	}
	540	}
	541	return -1;
	542	}
	543	#else
	544	#define create_hash_tables() ({ 0; })
	545	#endif
	546
	547	static int __init create_proc_profile(void)
	548	{
	549	struct proc_dir_entry *entry;
	550
	551	if (!prof_on)
	552	return 0;
	553	if (create_hash_tables())
	554	return -1;
	555	if (!(entry = create_proc_entry("profile", S_IWUSR \| S_IRUGO, NULL)))
	556	return 0;
	557	entry->proc_fops = &proc_profile_operations;
	558	entry->size = (1+prof_len) * sizeof(atomic_t);
	559	hotcpu_notifier(profile_cpu_callback, 0);
	560	return 0;
	561	}
	562	module_init(create_proc_profile);
	563	#endif /* CONFIG_PROC_FS */