From eb39c8803d0e3d98fe74825f99287f63d55e6460 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 16 Feb 2012 01:14:22 +0000 Subject: fadump: Reserve the memory for firmware assisted dump. Reserve the memory during early boot to preserve CPU state data, HPTE region and RMA (real mode area) region data in case of kernel crash. At the time of crash, powerpc firmware will store CPU state data, HPTE region data and move RMA region data to the reserved memory area. If the firmware-assisted dump fails to reserve the memory, then fallback to existing kexec-based kdump. Most of the code implementation to reserve memory has been adapted from phyp assisted dump implementation written by Linas Vepstas and Manish Ahuja This patch also introduces a config option CONFIG_FA_DUMP for firmware assisted dump feature on Powerpc (ppc64) architecture. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/fadump.c | 246 +++++++++++++++++++++++++++++++++++++++++++ arch/powerpc/kernel/prom.c | 15 ++- 3 files changed, 261 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/fadump.c (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index ee728e433aa2..391bf7e1ba2f 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_FA_DUMP) += fadump.o ifeq ($(CONFIG_PPC32),y) obj-$(CONFIG_E500) += idle_e500.o endif diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c new file mode 100644 index 000000000000..deb276a9ce71 --- /dev/null +++ b/arch/powerpc/kernel/fadump.c @@ -0,0 +1,246 @@ +/* + * Firmware Assisted dump: A robust mechanism to get reliable kernel crash + * dump with assistance from firmware. 
This approach does not use kexec, + * instead firmware assists in booting the kdump kernel while preserving + * memory contents. The most of the code implementation has been adapted + * from phyp assisted dump implementation written by Linas Vepstas and + * Manish Ahuja + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2011 IBM Corporation + * Author: Mahesh Salgaonkar + */ + +#undef DEBUG +#define pr_fmt(fmt) "fadump: " fmt + +#include +#include + +#include +#include +#include +#include + +static struct fw_dump fw_dump; + +/* Scan the Firmware Assisted dump configuration details. */ +int __init early_init_dt_scan_fw_dump(unsigned long node, + const char *uname, int depth, void *data) +{ + __be32 *sections; + int i, num_sections; + unsigned long size; + const int *token; + + if (depth != 1 || strcmp(uname, "rtas") != 0) + return 0; + + /* + * Check if Firmware Assisted dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL); + if (!token) + return 0; + + fw_dump.fadump_supported = 1; + fw_dump.ibm_configure_kernel_dump = *token; + + /* + * The 'ibm,kernel-dump' rtas node is present only if there is + * dump data waiting for us. 
+ */ + if (of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL)) + fw_dump.dump_active = 1; + + /* Get the sizes required to store dump data for the firmware provided + * dump sections. + * For each dump section type supported, a 32bit cell which defines + * the ID of a supported section followed by two 32 bit cells which + * gives the size of the section in bytes. + */ + sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", + &size); + + if (!sections) + return 0; + + num_sections = size / (3 * sizeof(u32)); + + for (i = 0; i < num_sections; i++, sections += 3) { + u32 type = (u32)of_read_number(sections, 1); + + switch (type) { + case FADUMP_CPU_STATE_DATA: + fw_dump.cpu_state_data_size = + of_read_ulong(&sections[1], 2); + break; + case FADUMP_HPTE_REGION: + fw_dump.hpte_region_size = + of_read_ulong(&sections[1], 2); + break; + } + } + return 1; +} + +/** + * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM + * + * Function to find the largest memory size we need to reserve during early + * boot process. This will be the size of the memory that is required for a + * kernel to boot successfully. + * + * This function has been taken from phyp-assisted dump feature implementation. + * + * returns larger of 256MB or 5% rounded down to multiples of 256MB. + * + * TODO: Come up with better approach to find out more accurate memory size + * that is required for a kernel to boot successfully. + * + */ +static inline unsigned long fadump_calculate_reserve_size(void) +{ + unsigned long size; + + /* + * Check if the size is specified through fadump_reserve_mem= cmdline + * option. If yes, then use that. + */ + if (fw_dump.reserve_bootvar) + return fw_dump.reserve_bootvar; + + /* divide by 20 to get 5% of value */ + size = memblock_end_of_DRAM() / 20; + + /* round it down in multiples of 256 */ + size = size & ~0x0FFFFFFFUL; + + /* Truncate to memory_limit. 
We don't want to over reserve the memory.*/ + if (memory_limit && size > memory_limit) + size = memory_limit; + + return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM); +} + +/* + * Calculate the total memory size required to be reserved for + * firmware-assisted dump registration. + */ +static unsigned long get_fadump_area_size(void) +{ + unsigned long size = 0; + + size += fw_dump.cpu_state_data_size; + size += fw_dump.hpte_region_size; + size += fw_dump.boot_memory_size; + + size = PAGE_ALIGN(size); + return size; +} + +int __init fadump_reserve_mem(void) +{ + unsigned long base, size, memory_boundary; + + if (!fw_dump.fadump_enabled) + return 0; + + if (!fw_dump.fadump_supported) { + printk(KERN_INFO "Firmware-assisted dump is not supported on" + " this hardware\n"); + fw_dump.fadump_enabled = 0; + return 0; + } + /* Initialize boot memory size */ + fw_dump.boot_memory_size = fadump_calculate_reserve_size(); + + /* + * Calculate the memory boundary. + * If memory_limit is less than actual memory boundary then reserve + * the memory for fadump beyond the memory_limit and adjust the + * memory_limit accordingly, so that the running kernel can run with + * specified memory_limit. + */ + if (memory_limit && memory_limit < memblock_end_of_DRAM()) { + size = get_fadump_area_size(); + if ((memory_limit + size) < memblock_end_of_DRAM()) + memory_limit += size; + else + memory_limit = memblock_end_of_DRAM(); + printk(KERN_INFO "Adjusted memory_limit for firmware-assisted" + " dump, now %#016llx\n", + (unsigned long long)memory_limit); + } + if (memory_limit) + memory_boundary = memory_limit; + else + memory_boundary = memblock_end_of_DRAM(); + + if (fw_dump.dump_active) { + printk(KERN_INFO "Firmware-assisted dump is active.\n"); + /* + * If last boot has crashed then reserve all the memory + * above boot_memory_size so that we don't touch it until + * dump is written to disk by userspace tool. This memory + * will be released for general use once the dump is saved. 
+ */ + base = fw_dump.boot_memory_size; + size = memory_boundary - base; + memblock_reserve(base, size); + printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " + "for saving crash dump\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20)); + } else { + /* Reserve the memory at the top of memory. */ + size = get_fadump_area_size(); + base = memory_boundary - size; + memblock_reserve(base, size); + printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " + "for firmware-assisted dump\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20)); + } + fw_dump.reserve_dump_area_start = base; + fw_dump.reserve_dump_area_size = size; + return 1; +} + +/* Look for fadump= cmdline option. */ +static int __init early_fadump_param(char *p) +{ + if (!p) + return 1; + + if (strncmp(p, "on", 2) == 0) + fw_dump.fadump_enabled = 1; + else if (strncmp(p, "off", 3) == 0) + fw_dump.fadump_enabled = 0; + + return 0; +} +early_param("fadump", early_fadump_param); + +/* Look for fadump_reserve_mem= cmdline option */ +static int __init early_fadump_reserve_mem(char *p) +{ + if (p) + fw_dump.reserve_bootvar = memparse(p, &p); + return 0; +} +early_param("fadump_reserve_mem", early_fadump_reserve_mem); diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index abe405dab34d..70222b35cfc5 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -55,6 +55,7 @@ #include #include #include +#include #include @@ -719,6 +720,11 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL); #endif +#ifdef CONFIG_FA_DUMP + /* scan tree to see if dump is active during last boot */ + of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); +#endif + /* Pre-initialize the cmd_line with the content of boot_commmand_line, * which will be empty except when the content of the variable has * been overriden by a bootloading mechanism. 
This happens typically @@ -750,7 +756,14 @@ void __init early_init_devtree(void *params) if (PHYSICAL_START > MEMORY_START) memblock_reserve(MEMORY_START, 0x8000); reserve_kdump_trampoline(); - reserve_crashkernel(); +#ifdef CONFIG_FA_DUMP + /* + * If we fail to reserve memory for firmware-assisted dump then + * fallback to kexec based kdump. + */ + if (fadump_reserve_mem() == 0) +#endif + reserve_crashkernel(); early_reserve_mem(); phyp_dump_reserve_mem(); -- cgit v1.2.2 From 3ccc00a7e04ff7718c9aebb4b0c982571c798759 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Mon, 20 Feb 2012 02:15:03 +0000 Subject: fadump: Register for firmware assisted dump. On 2012-02-20 11:02:51 Mon, Paul Mackerras wrote: > On Thu, Feb 16, 2012 at 04:44:30PM +0530, Mahesh J Salgaonkar wrote: > > If I have read the code correctly, we are going to get this printk on > non-pSeries machines or on older pSeries machines, even if the user > has not put the fadump=on option on the kernel command line. The > printk will be annoying since there is no actual error condition. It > seems to me that the condition for the printk should include > fw_dump.fadump_enabled. In other words you should probably add > > if (!fw_dump.fadump_enabled) > return 0; > > at the beginning of the function. Hi Paul, Thanks for pointing it out. Please find the updated patch below. The existing patches above this (4/10 through 10/10) cleanly applies on this update. Thanks, -Mahesh. 
Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/fadump.c | 355 ++++++++++++++++++++++++++++++++++++++++++- arch/powerpc/kernel/iommu.c | 8 +- 2 files changed, 359 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index deb276a9ce71..eb8f782afade 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -29,6 +29,9 @@ #include #include +#include +#include +#include #include #include @@ -36,6 +39,10 @@ #include static struct fw_dump fw_dump; +static struct fadump_mem_struct fdm; +static const struct fadump_mem_struct *fdm_active; + +static DEFINE_MUTEX(fadump_mutex); /* Scan the Firmware Assisted dump configuration details. */ int __init early_init_dt_scan_fw_dump(unsigned long node, @@ -64,7 +71,8 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, * The 'ibm,kernel-dump' rtas node is present only if there is * dump data waiting for us. */ - if (of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL)) + fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL); + if (fdm_active) fw_dump.dump_active = 1; /* Get the sizes required to store dump data for the firmware provided @@ -98,6 +106,85 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, return 1; } +int is_fadump_active(void) +{ + return fw_dump.dump_active; +} + +/* Print firmware assisted dump configurations for debugging purpose. */ +static void fadump_show_config(void) +{ + pr_debug("Support for firmware-assisted dump (fadump): %s\n", + (fw_dump.fadump_supported ? "present" : "no support")); + + if (!fw_dump.fadump_supported) + return; + + pr_debug("Fadump enabled : %s\n", + (fw_dump.fadump_enabled ? "yes" : "no")); + pr_debug("Dump Active : %s\n", + (fw_dump.dump_active ? 
"yes" : "no")); + pr_debug("Dump section sizes:\n"); + pr_debug(" CPU state data size: %lx\n", fw_dump.cpu_state_data_size); + pr_debug(" HPTE region size : %lx\n", fw_dump.hpte_region_size); + pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size); +} + +static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, + unsigned long addr) +{ + if (!fdm) + return 0; + + memset(fdm, 0, sizeof(struct fadump_mem_struct)); + addr = addr & PAGE_MASK; + + fdm->header.dump_format_version = 0x00000001; + fdm->header.dump_num_sections = 3; + fdm->header.dump_status_flag = 0; + fdm->header.offset_first_dump_section = + (u32)offsetof(struct fadump_mem_struct, cpu_state_data); + + /* + * Fields for disk dump option. + * We are not using disk dump option, hence set these fields to 0. + */ + fdm->header.dd_block_size = 0; + fdm->header.dd_block_offset = 0; + fdm->header.dd_num_blocks = 0; + fdm->header.dd_offset_disk_path = 0; + + /* set 0 to disable an automatic dump-reboot. */ + fdm->header.max_time_auto = 0; + + /* Kernel dump sections */ + /* cpu state data section. 
*/ + fdm->cpu_state_data.request_flag = FADUMP_REQUEST_FLAG; + fdm->cpu_state_data.source_data_type = FADUMP_CPU_STATE_DATA; + fdm->cpu_state_data.source_address = 0; + fdm->cpu_state_data.source_len = fw_dump.cpu_state_data_size; + fdm->cpu_state_data.destination_address = addr; + addr += fw_dump.cpu_state_data_size; + + /* hpte region section */ + fdm->hpte_region.request_flag = FADUMP_REQUEST_FLAG; + fdm->hpte_region.source_data_type = FADUMP_HPTE_REGION; + fdm->hpte_region.source_address = 0; + fdm->hpte_region.source_len = fw_dump.hpte_region_size; + fdm->hpte_region.destination_address = addr; + addr += fw_dump.hpte_region_size; + + /* RMA region section */ + fdm->rmr_region.request_flag = FADUMP_REQUEST_FLAG; + fdm->rmr_region.source_data_type = FADUMP_REAL_MODE_REGION; + fdm->rmr_region.source_address = RMA_START; + fdm->rmr_region.source_len = fw_dump.boot_memory_size; + fdm->rmr_region.destination_address = addr; + addr += fw_dump.boot_memory_size; + + return addr; +} + /** * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM * @@ -166,8 +253,15 @@ int __init fadump_reserve_mem(void) fw_dump.fadump_enabled = 0; return 0; } - /* Initialize boot memory size */ - fw_dump.boot_memory_size = fadump_calculate_reserve_size(); + /* + * Initialize boot memory size + * If dump is active then we have already calculated the size during + * first kernel. + */ + if (fdm_active) + fw_dump.boot_memory_size = fdm_active->rmr_region.source_len; + else + fw_dump.boot_memory_size = fadump_calculate_reserve_size(); /* * Calculate the memory boundary. 
@@ -244,3 +338,258 @@ static int __init early_fadump_reserve_mem(char *p) return 0; } early_param("fadump_reserve_mem", early_fadump_reserve_mem); + +static void register_fw_dump(struct fadump_mem_struct *fdm) +{ + int rc; + unsigned int wait_time; + + pr_debug("Registering for firmware-assisted kernel dump...\n"); + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, + FADUMP_REGISTER, fdm, + sizeof(struct fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + + } while (wait_time); + + switch (rc) { + case -1: + printk(KERN_ERR "Failed to register firmware-assisted kernel" + " dump. Hardware Error(%d).\n", rc); + break; + case -3: + printk(KERN_ERR "Failed to register firmware-assisted kernel" + " dump. Parameter Error(%d).\n", rc); + break; + case -9: + printk(KERN_ERR "firmware-assisted kernel dump is already " + " registered."); + fw_dump.dump_registered = 1; + break; + case 0: + printk(KERN_INFO "firmware-assisted kernel dump registration" + " is successful\n"); + fw_dump.dump_registered = 1; + break; + } +} + +static void register_fadump(void) +{ + /* + * If no memory is reserved then we can not register for firmware- + * assisted dump. + */ + if (!fw_dump.reserve_dump_area_size) + return; + + /* register the future kernel dump with firmware. */ + register_fw_dump(&fdm); +} + +static int fadump_unregister_dump(struct fadump_mem_struct *fdm) +{ + int rc = 0; + unsigned int wait_time; + + pr_debug("Un-register firmware-assisted dump\n"); + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, + FADUMP_UNREGISTER, fdm, + sizeof(struct fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + printk(KERN_ERR "Failed to un-register firmware-assisted dump." 
+ " unexpected error(%d).\n", rc); + return rc; + } + fw_dump.dump_registered = 0; + return 0; +} + +static ssize_t fadump_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", fw_dump.fadump_enabled); +} + +static ssize_t fadump_register_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", fw_dump.dump_registered); +} + +static ssize_t fadump_register_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret = 0; + + if (!fw_dump.fadump_enabled || fdm_active) + return -EPERM; + + mutex_lock(&fadump_mutex); + + switch (buf[0]) { + case '0': + if (fw_dump.dump_registered == 0) { + ret = -EINVAL; + goto unlock_out; + } + /* Un-register Firmware-assisted dump */ + fadump_unregister_dump(&fdm); + break; + case '1': + if (fw_dump.dump_registered == 1) { + ret = -EINVAL; + goto unlock_out; + } + /* Register Firmware-assisted dump */ + register_fadump(); + break; + default: + ret = -EINVAL; + break; + } + +unlock_out: + mutex_unlock(&fadump_mutex); + return ret < 0 ? 
ret : count; +} + +static int fadump_region_show(struct seq_file *m, void *private) +{ + const struct fadump_mem_struct *fdm_ptr; + + if (!fw_dump.fadump_enabled) + return 0; + + if (fdm_active) + fdm_ptr = fdm_active; + else + fdm_ptr = &fdm; + + seq_printf(m, + "CPU : [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->cpu_state_data.destination_address, + fdm_ptr->cpu_state_data.destination_address + + fdm_ptr->cpu_state_data.source_len - 1, + fdm_ptr->cpu_state_data.source_len, + fdm_ptr->cpu_state_data.bytes_dumped); + seq_printf(m, + "HPTE: [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->hpte_region.destination_address, + fdm_ptr->hpte_region.destination_address + + fdm_ptr->hpte_region.source_len - 1, + fdm_ptr->hpte_region.source_len, + fdm_ptr->hpte_region.bytes_dumped); + seq_printf(m, + "DUMP: [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->rmr_region.destination_address, + fdm_ptr->rmr_region.destination_address + + fdm_ptr->rmr_region.source_len - 1, + fdm_ptr->rmr_region.source_len, + fdm_ptr->rmr_region.bytes_dumped); + + if (!fdm_active || + (fw_dump.reserve_dump_area_start == + fdm_ptr->cpu_state_data.destination_address)) + return 0; + + /* Dump is active. Show reserved memory region. 
*/ + seq_printf(m, + " : [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + (unsigned long long)fw_dump.reserve_dump_area_start, + fdm_ptr->cpu_state_data.destination_address - 1, + fdm_ptr->cpu_state_data.destination_address - + fw_dump.reserve_dump_area_start, + fdm_ptr->cpu_state_data.destination_address - + fw_dump.reserve_dump_area_start); + return 0; +} + +static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled, + 0444, fadump_enabled_show, + NULL); +static struct kobj_attribute fadump_register_attr = __ATTR(fadump_registered, + 0644, fadump_register_show, + fadump_register_store); + +static int fadump_region_open(struct inode *inode, struct file *file) +{ + return single_open(file, fadump_region_show, inode->i_private); +} + +static const struct file_operations fadump_region_fops = { + .open = fadump_region_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void fadump_init_files(void) +{ + struct dentry *debugfs_file; + int rc = 0; + + rc = sysfs_create_file(kernel_kobj, &fadump_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_enabled (%d)\n", rc); + + rc = sysfs_create_file(kernel_kobj, &fadump_register_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_registered (%d)\n", rc); + + debugfs_file = debugfs_create_file("fadump_region", 0444, + powerpc_debugfs_root, NULL, + &fadump_region_fops); + if (!debugfs_file) + printk(KERN_ERR "fadump: unable to create debugfs file" + " fadump_region\n"); + return; +} + +/* + * Prepare for firmware-assisted dump. + */ +int __init setup_fadump(void) +{ + if (!fw_dump.fadump_enabled) + return 0; + + if (!fw_dump.fadump_supported) { + printk(KERN_ERR "Firmware-assisted dump is not supported on" + " this hardware\n"); + return 0; + } + + fadump_show_config(); + /* Initialize the kernel dump memory structure for FAD registration. 
*/ + if (fw_dump.reserve_dump_area_size) + init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); + fadump_init_files(); + + return 1; +} +subsys_initcall(setup_fadump); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 0cfcf98aafca..359f078571c7 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -39,6 +39,7 @@ #include #include #include +#include #define DBG(...) @@ -445,7 +446,12 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, static void iommu_table_clear(struct iommu_table *tbl) { - if (!is_kdump_kernel()) { + /* + * In case of firmware assisted dump system goes through clean + * reboot process at the time of system crash. Hence it's safe to + * clear the TCE entries if firmware assisted dump is active. + */ + if (!is_kdump_kernel() || is_fadump_active()) { /* Clear the table in case firmware left allocations in it */ ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); return; -- cgit v1.2.2 From 2df173d9e85d9e2c6a8933c63f0c034accff7e0f Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 16 Feb 2012 01:14:37 +0000 Subject: fadump: Initialize elfcore header and add PT_LOAD program headers. Build the crash memory range list by traversing through system memory during the first kernel before we register for firmware-assisted dump. After the successful dump registration, initialize the elfcore header and populate PT_LOAD program headers with crash memory ranges. The elfcore header is saved in the scratch area within the reserved memory. The scratch area starts at the end of the memory reserved for saving RMR region contents. The scratch area contains fadump crash info structure that contains magic number for fadump validation and physical address where the elfcore header can be found. 
This structure will also be used to pass some important crash info data to the second kernel which will help second kernel to populate ELF core header with correct data before it gets exported through /proc/vmcore. Since the firmware preserves the entire partition memory at the time of crash the contents of the scratch area will be preserved till second kernel boot. Since the memory dump exported through /proc/vmcore is in ELF format similar to kdump, it will help us to reuse the kdump infrastructure for dump capture and filtering. Unlike phyp dump, userspace tool does not need to refer to any sysfs interface while reading /proc/vmcore. NOTE: The current design implementation does not address a possibility of introducing additional fields (in future) to this structure without affecting compatibility. It's on TODO list to come up with better approach to address this. Reserved dump area start => +-------------------------------------+ | CPU state dump data | +-------------------------------------+ | HPTE region data | +-------------------------------------+ | RMR region data | Scratch area start => +-------------------------------------+ | fadump crash info structure { | | magic number | +------|---- elfcorehdr_addr | | | } | +----> +-------------------------------------+ | ELF core header | Reserved dump area end => +-------------------------------------+ Signed-off-by: Mahesh Salgaonkar Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/fadump.c | 233 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 232 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index eb8f782afade..63857e183de5 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,8 @@ static struct fadump_mem_struct fdm; static const struct fadump_mem_struct *fdm_active; static 
DEFINE_MUTEX(fadump_mutex); +struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES]; +int crash_mem_ranges; /* Scan the Firmware Assisted dump configuration details. */ int __init early_init_dt_scan_fw_dump(unsigned long node, @@ -235,6 +238,10 @@ static unsigned long get_fadump_area_size(void) size += fw_dump.cpu_state_data_size; size += fw_dump.hpte_region_size; size += fw_dump.boot_memory_size; + size += sizeof(struct fadump_crash_info_header); + size += sizeof(struct elfhdr); /* ELF core header.*/ + /* Program headers for crash memory regions. */ + size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2); size = PAGE_ALIGN(size); return size; @@ -300,6 +307,12 @@ int __init fadump_reserve_mem(void) "for saving crash dump\n", (unsigned long)(size >> 20), (unsigned long)(base >> 20)); + + fw_dump.fadumphdr_addr = + fdm_active->rmr_region.destination_address + + fdm_active->rmr_region.source_len; + pr_debug("fadumphdr_addr = %p\n", + (void *) fw_dump.fadumphdr_addr); } else { /* Reserve the memory at the top of memory. */ size = get_fadump_area_size(); @@ -380,8 +393,210 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) } } +/* + * Validate and process the dump data stored by firmware before exporting + * it through '/proc/vmcore'. + */ +static int __init process_fadump(const struct fadump_mem_struct *fdm_active) +{ + struct fadump_crash_info_header *fdh; + + if (!fdm_active || !fw_dump.fadumphdr_addr) + return -EINVAL; + + /* Check if the dump data is valid. 
*/ + if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) || + (fdm_active->rmr_region.error_flags != 0)) { + printk(KERN_ERR "Dump taken by platform is not valid\n"); + return -EINVAL; + } + if (fdm_active->rmr_region.bytes_dumped != + fdm_active->rmr_region.source_len) { + printk(KERN_ERR "Dump taken by platform is incomplete\n"); + return -EINVAL; + } + + /* Validate the fadump crash info header */ + fdh = __va(fw_dump.fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + printk(KERN_ERR "Crash info header is not valid.\n"); + return -EINVAL; + } + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. + */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return 0; +} + +static inline void fadump_add_crash_memory(unsigned long long base, + unsigned long long end) +{ + if (base == end) + return; + + pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", + crash_mem_ranges, base, end - 1, (end - base)); + crash_memory_ranges[crash_mem_ranges].base = base; + crash_memory_ranges[crash_mem_ranges].size = end - base; + crash_mem_ranges++; +} + +static void fadump_exclude_reserved_area(unsigned long long start, + unsigned long long end) +{ + unsigned long long ra_start, ra_end; + + ra_start = fw_dump.reserve_dump_area_start; + ra_end = ra_start + fw_dump.reserve_dump_area_size; + + if ((ra_start < end) && (ra_end > start)) { + if ((start < ra_start) && (end > ra_end)) { + fadump_add_crash_memory(start, ra_start); + fadump_add_crash_memory(ra_end, end); + } else if (start < ra_start) { + fadump_add_crash_memory(start, ra_start); + } else if (ra_end < end) { + fadump_add_crash_memory(ra_end, end); + } + } else + fadump_add_crash_memory(start, end); +} + +static int fadump_init_elfcore_header(char *bufp) +{ + struct elfhdr *elf; + + elf = (struct elfhdr *) bufp; + bufp += sizeof(struct elfhdr); 
+ memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = 0; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + + return 0; +} + +/* + * Traverse through memblock structure and setup crash memory ranges. These + * ranges will be used create PT_LOAD program headers in elfcore header. + */ +static void fadump_setup_crash_memory_ranges(void) +{ + struct memblock_region *reg; + unsigned long long start, end; + + pr_debug("Setup crash memory ranges.\n"); + crash_mem_ranges = 0; + /* + * add the first memory chunk (RMA_START through boot_memory_size) as + * a separate memory chunk. The reason is, at the time crash firmware + * will move the content of this memory chunk to different location + * specified during fadump registration. We need to create a separate + * program header for this chunk with the correct offset. + */ + fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size); + + for_each_memblock(memory, reg) { + start = (unsigned long long)reg->base; + end = start + (unsigned long long)reg->size; + if (start == RMA_START && end >= fw_dump.boot_memory_size) + start = fw_dump.boot_memory_size; + + /* add this range excluding the reserved dump area. */ + fadump_exclude_reserved_area(start, end); + } +} + +static int fadump_create_elfcore_headers(char *bufp) +{ + struct elfhdr *elf; + struct elf_phdr *phdr; + int i; + + fadump_init_elfcore_header(bufp); + elf = (struct elfhdr *)bufp; + bufp += sizeof(struct elfhdr); + + /* setup PT_LOAD sections. 
*/ + + for (i = 0; i < crash_mem_ranges; i++) { + unsigned long long mbase, msize; + mbase = crash_memory_ranges[i].base; + msize = crash_memory_ranges[i].size; + + if (!msize) + continue; + + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mbase; + + if (mbase == RMA_START) { + /* + * The entire RMA region will be moved by firmware + * to the specified destination_address. Hence set + * the correct offset. + */ + phdr->p_offset = fdm.rmr_region.destination_address; + } + + phdr->p_paddr = mbase; + phdr->p_vaddr = (unsigned long)__va(mbase); + phdr->p_filesz = msize; + phdr->p_memsz = msize; + phdr->p_align = 0; + + /* Increment number of program headers. */ + (elf->e_phnum)++; + } + return 0; +} + +static unsigned long init_fadump_header(unsigned long addr) +{ + struct fadump_crash_info_header *fdh; + + if (!addr) + return 0; + + fw_dump.fadumphdr_addr = addr; + fdh = __va(addr); + addr += sizeof(struct fadump_crash_info_header); + + memset(fdh, 0, sizeof(struct fadump_crash_info_header)); + fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; + fdh->elfcorehdr_addr = addr; + + return addr; +} + static void register_fadump(void) { + unsigned long addr; + void *vaddr; + /* * If no memory is reserved then we can not register for firmware- * assisted dump. @@ -389,6 +604,16 @@ static void register_fadump(void) if (!fw_dump.reserve_dump_area_size) return; + fadump_setup_crash_memory_ranges(); + + addr = fdm.rmr_region.destination_address + fdm.rmr_region.source_len; + /* Initialize fadump crash info header. */ + addr = init_fadump_header(addr); + vaddr = __va(addr); + + pr_debug("Creating ELF core headers at %#016lx\n", addr); + fadump_create_elfcore_headers(vaddr); + /* register the future kernel dump with firmware. 
*/ register_fw_dump(&fdm); } @@ -585,8 +810,14 @@ int __init setup_fadump(void) } fadump_show_config(); + /* + * If dump data is available then see if it is valid and prepare for + * saving it to the disk. + */ + if (fw_dump.dump_active) + process_fadump(fdm_active); /* Initialize the kernel dump memory structure for FAD registration. */ - if (fw_dump.reserve_dump_area_size) + else if (fw_dump.reserve_dump_area_size) init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); fadump_init_files(); -- cgit v1.2.2 From ebaeb5ae24379b5b635dc1d1fa6df904bc95b4d9 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 16 Feb 2012 01:14:45 +0000 Subject: fadump: Convert firmware-assisted cpu state dump data into elf notes. When registered for firmware assisted dump on powerpc, firmware preserves the registers for the active CPUs during a system crash. This patch reads the cpu register data stored in Firmware-assisted dump format (except for crashing cpu) and converts it into elf notes and updates the PT_NOTE program header accordingly. The exact register state for crashing cpu is saved to fadump crash info structure in scratch area during crash_fadump() and read during second kernel boot. 
Signed-off-by: Mahesh Salgaonkar Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/fadump.c | 314 ++++++++++++++++++++++++++++++++++++- arch/powerpc/kernel/setup-common.c | 6 + arch/powerpc/kernel/traps.c | 3 + 3 files changed, 321 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 63857e183de5..da68bdad194a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -240,6 +240,7 @@ static unsigned long get_fadump_area_size(void) size += fw_dump.boot_memory_size; size += sizeof(struct fadump_crash_info_header); size += sizeof(struct elfhdr); /* ELF core header.*/ + size += sizeof(struct elf_phdr); /* place holder for cpu notes */ /* Program headers for crash memory regions. */ size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2); @@ -393,6 +394,285 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) } } +void crash_fadump(struct pt_regs *regs, const char *str) +{ + struct fadump_crash_info_header *fdh = NULL; + + if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) + return; + + fdh = __va(fw_dump.fadumphdr_addr); + crashing_cpu = smp_processor_id(); + fdh->crashing_cpu = crashing_cpu; + crash_save_vmcoreinfo(); + + if (regs) + fdh->regs = *regs; + else + ppc_save_regs(&fdh->regs); + + fdh->cpu_online_mask = *cpu_online_mask; + + /* Call ibm,os-term rtas call to trigger firmware assisted dump */ + rtas_os_term((char *)str); +} + +#define GPR_MASK 0xffffff0000000000 +static inline int fadump_gpr_index(u64 id) +{ + int i = -1; + char str[3]; + + if ((id & GPR_MASK) == REG_ID("GPR")) { + /* get the digits at the end */ + id &= ~GPR_MASK; + id >>= 24; + str[2] = '\0'; + str[1] = id & 0xff; + str[0] = (id >> 8) & 0xff; + sscanf(str, "%d", &i); + if (i > 31) + i = -1; + } + return i; +} + +static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id, + u64 reg_val) +{ + int i; + + i = 
fadump_gpr_index(reg_id); + if (i >= 0) + regs->gpr[i] = (unsigned long)reg_val; + else if (reg_id == REG_ID("NIA")) + regs->nip = (unsigned long)reg_val; + else if (reg_id == REG_ID("MSR")) + regs->msr = (unsigned long)reg_val; + else if (reg_id == REG_ID("CTR")) + regs->ctr = (unsigned long)reg_val; + else if (reg_id == REG_ID("LR")) + regs->link = (unsigned long)reg_val; + else if (reg_id == REG_ID("XER")) + regs->xer = (unsigned long)reg_val; + else if (reg_id == REG_ID("CR")) + regs->ccr = (unsigned long)reg_val; + else if (reg_id == REG_ID("DAR")) + regs->dar = (unsigned long)reg_val; + else if (reg_id == REG_ID("DSISR")) + regs->dsisr = (unsigned long)reg_val; +} + +static struct fadump_reg_entry* +fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + + while (reg_entry->reg_id != REG_ID("CPUEND")) { + fadump_set_regval(regs, reg_entry->reg_id, + reg_entry->reg_value); + reg_entry++; + } + reg_entry++; + return reg_entry; +} + +static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type, + void *data, size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, &note, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void fadump_final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, &note, sizeof(note)); +} + +static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) +{ + struct elf_prstatus prstatus; + + memset(&prstatus, 0, sizeof(prstatus)); + /* + * FIXME: How do i get PID? Do I really need it? + * prstatus.pr_pid = ????
+ */ + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + return buf; +} + +static void fadump_update_elfcore_header(char *bufp) +{ + struct elfhdr *elf; + struct elf_phdr *phdr; + + elf = (struct elfhdr *)bufp; + bufp += sizeof(struct elfhdr); + + /* First note is a place holder for cpu notes info. */ + phdr = (struct elf_phdr *)bufp; + + if (phdr->p_type == PT_NOTE) { + phdr->p_paddr = fw_dump.cpu_notes_buf; + phdr->p_offset = phdr->p_paddr; + phdr->p_filesz = fw_dump.cpu_notes_buf_size; + phdr->p_memsz = fw_dump.cpu_notes_buf_size; + } + return; +} + +static void *fadump_cpu_notes_buf_alloc(unsigned long size) +{ + void *vaddr; + struct page *page; + unsigned long order, count, i; + + order = get_order(size); + vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + if (!vaddr) + return NULL; + + count = 1 << order; + page = virt_to_page(vaddr); + for (i = 0; i < count; i++) + SetPageReserved(page + i); + return vaddr; +} + +static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size) +{ + struct page *page; + unsigned long order, count, i; + + order = get_order(size); + count = 1 << order; + page = virt_to_page(vaddr); + for (i = 0; i < count; i++) + ClearPageReserved(page + i); + __free_pages(page, order); +} + +/* + * Read CPU state dump data and convert it into ELF notes. + * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be + * used to access the data to allow for additional fields to be added without + * affecting compatibility. Each list of registers for a CPU starts with + * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes, + * 8 Byte ASCII identifier and 8 Byte register value. The register entry + * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part + * of register value. For more details refer to PAPR document. 
+ * + * Only for the crashing cpu we ignore the CPU dump data and get exact + * state from fadump crash info structure populated by first kernel at the + * time of crash. + */ +static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) +{ + struct fadump_reg_save_area_header *reg_header; + struct fadump_reg_entry *reg_entry; + struct fadump_crash_info_header *fdh = NULL; + void *vaddr; + unsigned long addr; + u32 num_cpus, *note_buf; + struct pt_regs regs; + int i, rc = 0, cpu = 0; + + if (!fdm->cpu_state_data.bytes_dumped) + return -EINVAL; + + addr = fdm->cpu_state_data.destination_address; + vaddr = __va(addr); + + reg_header = vaddr; + if (reg_header->magic_number != REGSAVE_AREA_MAGIC) { + printk(KERN_ERR "Unable to read register save area.\n"); + return -ENOENT; + } + pr_debug("--------CPU State Data------------\n"); + pr_debug("Magic Number: %llx\n", reg_header->magic_number); + pr_debug("NumCpuOffset: %x\n", reg_header->num_cpu_offset); + + vaddr += reg_header->num_cpu_offset; + num_cpus = *((u32 *)(vaddr)); + pr_debug("NumCpus : %u\n", num_cpus); + vaddr += sizeof(u32); + reg_entry = (struct fadump_reg_entry *)vaddr; + + /* Allocate buffer to hold cpu crash notes. 
*/ + fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); + fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size); + note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size); + if (!note_buf) { + printk(KERN_ERR "Failed to allocate 0x%lx bytes for " + "cpu notes buffer\n", fw_dump.cpu_notes_buf_size); + return -ENOMEM; + } + fw_dump.cpu_notes_buf = __pa(note_buf); + + pr_debug("Allocated buffer for cpu notes of size %ld at %p\n", + (num_cpus * sizeof(note_buf_t)), note_buf); + + if (fw_dump.fadumphdr_addr) + fdh = __va(fw_dump.fadumphdr_addr); + + for (i = 0; i < num_cpus; i++) { + if (reg_entry->reg_id != REG_ID("CPUSTRT")) { + printk(KERN_ERR "Unable to read CPU state data\n"); + rc = -ENOENT; + goto error_out; + } + /* Lower 4 bytes of reg_value contains logical cpu id */ + cpu = reg_entry->reg_value & FADUMP_CPU_ID_MASK; + if (!cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) { + SKIP_TO_NEXT_CPU(reg_entry); + continue; + } + pr_debug("Reading register data for cpu %d...\n", cpu); + if (fdh && fdh->crashing_cpu == cpu) { + regs = fdh->regs; + note_buf = fadump_regs_to_elf_notes(note_buf, &regs); + SKIP_TO_NEXT_CPU(reg_entry); + } else { + reg_entry++; + reg_entry = fadump_read_registers(reg_entry, &regs); + note_buf = fadump_regs_to_elf_notes(note_buf, &regs); + } + } + fadump_final_note(note_buf); + + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fdh->elfcorehdr_addr); + fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr)); + return 0; + +error_out: + fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf), + fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf = 0; + fw_dump.cpu_notes_buf_size = 0; + return rc; + +} + /* * Validate and process the dump data stored by firmware before exporting * it through '/proc/vmcore'.
@@ -400,18 +680,21 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) static int __init process_fadump(const struct fadump_mem_struct *fdm_active) { struct fadump_crash_info_header *fdh; + int rc = 0; if (!fdm_active || !fw_dump.fadumphdr_addr) return -EINVAL; /* Check if the dump data is valid. */ if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) || + (fdm_active->cpu_state_data.error_flags != 0) || (fdm_active->rmr_region.error_flags != 0)) { printk(KERN_ERR "Dump taken by platform is not valid\n"); return -EINVAL; } - if (fdm_active->rmr_region.bytes_dumped != - fdm_active->rmr_region.source_len) { + if ((fdm_active->rmr_region.bytes_dumped != + fdm_active->rmr_region.source_len) || + !fdm_active->cpu_state_data.bytes_dumped) { printk(KERN_ERR "Dump taken by platform is incomplete\n"); return -EINVAL; } @@ -423,6 +706,10 @@ static int __init process_fadump(const struct fadump_mem_struct *fdm_active) return -EINVAL; } + rc = fadump_build_cpu_notes(fdm_active); + if (rc) + return rc; + /* * We are done validating dump info and elfcore header is now ready * to be exported. set elfcorehdr_addr so that vmcore module will @@ -537,6 +824,27 @@ static int fadump_create_elfcore_headers(char *bufp) elf = (struct elfhdr *)bufp; bufp += sizeof(struct elfhdr); + /* + * setup ELF PT_NOTE, place holder for cpu notes info. The notes info + * will be populated during second kernel boot after crash. Hence + * this PT_NOTE will always be the first elf note. + * + * NOTE: Any new ELF note addition should be placed after this note. + */ + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_NOTE; + phdr->p_flags = 0; + phdr->p_vaddr = 0; + phdr->p_align = 0; + + phdr->p_offset = 0; + phdr->p_paddr = 0; + phdr->p_filesz = 0; + phdr->p_memsz = 0; + + (elf->e_phnum)++; + /* setup PT_LOAD sections. 
*/ for (i = 0; i < crash_mem_ranges; i++) { @@ -588,6 +896,8 @@ static unsigned long init_fadump_header(unsigned long addr) memset(fdh, 0, sizeof(struct fadump_crash_info_header)); fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; fdh->elfcorehdr_addr = addr; + /* We will set the crashing cpu id in crash_fadump() during crash. */ + fdh->crashing_cpu = CPU_UNKNOWN; return addr; } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 77bb77da05c1..4e62a56e1a95 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -61,6 +61,7 @@ #include #include #include +#include #include "setup.h" @@ -639,6 +640,11 @@ EXPORT_SYMBOL(check_legacy_ioport); static int ppc_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { + /* + * If firmware-assisted dump has been registered then trigger + * firmware-assisted dump and let firmware handle everything else. + */ + crash_fadump(NULL, ptr); ppc_md.panic(ptr); /* May not return */ return NOTIFY_DONE; } diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index c091527efd89..5d40e592ffcb 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -57,6 +57,7 @@ #include #include #include +#include #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) int (*__debugger)(struct pt_regs *regs) __read_mostly; @@ -145,6 +146,8 @@ static void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, arch_spin_unlock(&die_lock); raw_local_irq_restore(flags); + crash_fadump(regs, "die oops"); + /* * A system reset (0x100) is a request to dump, so we always send * it through the crashdump code. -- cgit v1.2.2 From d34c5f26cf7de52a72ee064698817a5a39b91767 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 16 Feb 2012 01:14:53 +0000 Subject: fadump: Add PT_NOTE program header for vmcoreinfo Introduce a PT_NOTE program header that points to physical address of vmcoreinfo_note buffer declared in kernel/kexec.c. 
The vmcoreinfo note buffer is populated during crash_fadump() at the time of system crash. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/fadump.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index da68bdad194a..a83bc9015c64 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -814,6 +814,19 @@ static void fadump_setup_crash_memory_ranges(void) } } +/* + * If the given physical address falls within the boot memory region then + * return the relocated address that points to the dump region reserved + * for saving initial boot memory contents. + */ +static inline unsigned long fadump_relocate(unsigned long paddr) +{ + if (paddr > RMA_START && paddr < fw_dump.boot_memory_size) + return fdm.rmr_region.destination_address + paddr; + else + return paddr; +} + static int fadump_create_elfcore_headers(char *bufp) { struct elfhdr *elf; @@ -845,6 +858,22 @@ static int fadump_create_elfcore_headers(char *bufp) (elf->e_phnum)++; + /* setup ELF PT_NOTE for vmcoreinfo */ + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_NOTE; + phdr->p_flags = 0; + phdr->p_vaddr = 0; + phdr->p_align = 0; + + phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); + phdr->p_offset = phdr->p_paddr; + phdr->p_memsz = vmcoreinfo_max_size; + phdr->p_filesz = vmcoreinfo_max_size; + + /* Increment number of program headers. */ + (elf->e_phnum)++; + /* setup PT_LOAD sections. */ for (i = 0; i < crash_mem_ranges; i++) { -- cgit v1.2.2 From b500afff11f64227ca69fd2d05986d08d9573935 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 16 Feb 2012 01:15:08 +0000 Subject: fadump: Invalidate registration and release reserved memory for general use. 
This patch introduces an sysfs interface '/sys/kernel/fadump_release_mem' to invalidate the last fadump registration, invalidate '/proc/vmcore', release the reserved memory for general use and re-register for future kernel dump. Once the dump is copied to the disk, unlike phyp dump, the userspace tool can release all the memory reserved for dump with one single operation of echo 1 to '/sys/kernel/fadump_release_mem'. Release the reserved memory region excluding the size of the memory required for future kernel dump registration. And therefore, unlike kdump, Fadump doesn't need a 2nd reboot to get back the system to the production configuration. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/fadump.c | 158 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 154 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a83bc9015c64..cfe7a38708c3 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include @@ -984,6 +986,132 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm) return 0; } +static int fadump_invalidate_dump(struct fadump_mem_struct *fdm) +{ + int rc = 0; + unsigned int wait_time; + + pr_debug("Invalidating firmware-assisted dump registration\n"); + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, + FADUMP_INVALIDATE, fdm, + sizeof(struct fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + printk(KERN_ERR "Failed to invalidate firmware-assisted dump " + "rgistration. 
unexpected error(%d).\n", rc); + return rc; + } + fw_dump.dump_active = 0; + fdm_active = NULL; + return 0; +} + +void fadump_cleanup(void) +{ + /* Invalidate the registration only if dump is active. */ + if (fw_dump.dump_active) { + init_fadump_mem_struct(&fdm, + fdm_active->cpu_state_data.destination_address); + fadump_invalidate_dump(&fdm); + } +} + +/* + * Release the memory that was reserved in early boot to preserve the memory + * contents. The released memory will be available for general use. + */ +static void fadump_release_memory(unsigned long begin, unsigned long end) +{ + unsigned long addr; + unsigned long ra_start, ra_end; + + ra_start = fw_dump.reserve_dump_area_start; + ra_end = ra_start + fw_dump.reserve_dump_area_size; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + /* + * exclude the dump reserve area. Will reuse it for next + * fadump registration. + */ + if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start)) + continue; + + ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); + init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); + free_page((unsigned long)__va(addr)); + totalram_pages++; + } +} + +static void fadump_invalidate_release_mem(void) +{ + unsigned long reserved_area_start, reserved_area_end; + unsigned long destination_address; + + mutex_lock(&fadump_mutex); + if (!fw_dump.dump_active) { + mutex_unlock(&fadump_mutex); + return; + } + + destination_address = fdm_active->cpu_state_data.destination_address; + fadump_cleanup(); + mutex_unlock(&fadump_mutex); + + /* + * Save the current reserved memory bounds we will require them + * later for releasing the memory for general use. + */ + reserved_area_start = fw_dump.reserve_dump_area_start; + reserved_area_end = reserved_area_start + + fw_dump.reserve_dump_area_size; + /* + * Setup reserve_dump_area_start and its size so that we can + * reuse this reserved memory for Re-registration. 
+ */ + fw_dump.reserve_dump_area_start = destination_address; + fw_dump.reserve_dump_area_size = get_fadump_area_size(); + + fadump_release_memory(reserved_area_start, reserved_area_end); + if (fw_dump.cpu_notes_buf) { + fadump_cpu_notes_buf_free( + (unsigned long)__va(fw_dump.cpu_notes_buf), + fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf = 0; + fw_dump.cpu_notes_buf_size = 0; + } + /* Initialize the kernel dump memory structure for FAD registration. */ + init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); +} + +static ssize_t fadump_release_memory_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!fw_dump.dump_active) + return -EPERM; + + if (buf[0] == '1') { + /* + * Take away the '/proc/vmcore'. We are releasing the dump + * memory, hence it will not be valid anymore. + */ + vmcore_cleanup(); + fadump_invalidate_release_mem(); + + } else + return -EINVAL; + return count; +} + static ssize_t fadump_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -1043,10 +1171,13 @@ static int fadump_region_show(struct seq_file *m, void *private) if (!fw_dump.fadump_enabled) return 0; + mutex_lock(&fadump_mutex); if (fdm_active) fdm_ptr = fdm_active; - else + else { + mutex_unlock(&fadump_mutex); fdm_ptr = &fdm; + } seq_printf(m, "CPU : [%#016llx-%#016llx] %#llx bytes, " @@ -1076,7 +1207,7 @@ static int fadump_region_show(struct seq_file *m, void *private) if (!fdm_active || (fw_dump.reserve_dump_area_start == fdm_ptr->cpu_state_data.destination_address)) - return 0; + goto out; /* Dump is active. Show reserved memory region. 
*/ seq_printf(m, @@ -1088,9 +1219,15 @@ static int fadump_region_show(struct seq_file *m, void *private) fw_dump.reserve_dump_area_start, fdm_ptr->cpu_state_data.destination_address - fw_dump.reserve_dump_area_start); +out: + if (fdm_active) + mutex_unlock(&fadump_mutex); return 0; } +static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem, + 0200, NULL, + fadump_release_memory_store); static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled, 0444, fadump_enabled_show, NULL); @@ -1131,6 +1268,13 @@ static void fadump_init_files(void) if (!debugfs_file) printk(KERN_ERR "fadump: unable to create debugfs file" " fadump_region\n"); + + if (fw_dump.dump_active) { + rc = sysfs_create_file(kernel_kobj, &fadump_release_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_release_mem (%d)\n", rc); + } return; } @@ -1153,8 +1297,14 @@ int __init setup_fadump(void) * If dump data is available then see if it is valid and prepare for * saving it to the disk. */ - if (fw_dump.dump_active) - process_fadump(fdm_active); + if (fw_dump.dump_active) { + /* + * if dump process fails then invalidate the registration + * and release memory before proceeding for re-registration. + */ + if (process_fadump(fdm_active) < 0) + fadump_invalidate_release_mem(); + } /* Initialize the kernel dump memory structure for FAD registration. */ else if (fw_dump.reserve_dump_area_size) init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); -- cgit v1.2.2 From 67b43b9d7ced37a2e72e2c3e06464aa0a5be95f9 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 16 Feb 2012 01:15:15 +0000 Subject: fadump: Invalidate the fadump registration during machine shutdown. If dump is active during system reboot, shutdown or halt then invalidate the fadump registration as it does not get invalidated automatically. 
Signed-off-by: Mahesh Salgaonkar Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/setup-common.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 4e62a56e1a95..b0ebdeab9494 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -110,6 +110,14 @@ EXPORT_SYMBOL(ppc_do_canonicalize_irqs); /* also used by kexec */ void machine_shutdown(void) { +#ifdef CONFIG_FA_DUMP + /* + * if fadump is active, cleanup the fadump registration before we + * shutdown. + */ + fadump_cleanup(); +#endif + if (ppc_md.machine_shutdown) ppc_md.machine_shutdown(); } -- cgit v1.2.2 From 12d9299241241200e4f34f3b02f206fa8384a923 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 16 Feb 2012 01:15:23 +0000 Subject: fadump: Remove the phyp assisted dump code. Remove the phyp assisted dump implementation which is not is use. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/prom.c | 87 ---------------------------------------------- 1 file changed, 87 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 70222b35cfc5..89e850af3dd6 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include @@ -616,86 +615,6 @@ static void __init early_reserve_mem(void) } } -#ifdef CONFIG_PHYP_DUMP -/** - * phyp_dump_calculate_reserve_size() - reserve variable boot area 5% or arg - * - * Function to find the largest size we need to reserve - * during early boot process. - * - * It either looks for boot param and returns that OR - * returns larger of 256 or 5% rounded down to multiples of 256MB. 
- * - */ -static inline unsigned long phyp_dump_calculate_reserve_size(void) -{ - unsigned long tmp; - - if (phyp_dump_info->reserve_bootvar) - return phyp_dump_info->reserve_bootvar; - - /* divide by 20 to get 5% of value */ - tmp = memblock_end_of_DRAM(); - do_div(tmp, 20); - - /* round it down in multiples of 256 */ - tmp = tmp & ~0x0FFFFFFFUL; - - return (tmp > PHYP_DUMP_RMR_END ? tmp : PHYP_DUMP_RMR_END); -} - -/** - * phyp_dump_reserve_mem() - reserve all not-yet-dumped mmemory - * - * This routine may reserve memory regions in the kernel only - * if the system is supported and a dump was taken in last - * boot instance or if the hardware is supported and the - * scratch area needs to be setup. In other instances it returns - * without reserving anything. The memory in case of dump being - * active is freed when the dump is collected (by userland tools). - */ -static void __init phyp_dump_reserve_mem(void) -{ - unsigned long base, size; - unsigned long variable_reserve_size; - - if (!phyp_dump_info->phyp_dump_configured) { - printk(KERN_ERR "Phyp-dump not supported on this hardware\n"); - return; - } - - if (!phyp_dump_info->phyp_dump_at_boot) { - printk(KERN_INFO "Phyp-dump disabled at boot time\n"); - return; - } - - variable_reserve_size = phyp_dump_calculate_reserve_size(); - - if (phyp_dump_info->phyp_dump_is_active) { - /* Reserve *everything* above RMR.Area freed by userland tools*/ - base = variable_reserve_size; - size = memblock_end_of_DRAM() - base; - - /* XXX crashed_ram_end is wrong, since it may be beyond - * the memory_limit, it will need to be adjusted. 
*/ - memblock_reserve(base, size); - - phyp_dump_info->init_reserve_start = base; - phyp_dump_info->init_reserve_size = size; - } else { - size = phyp_dump_info->cpu_state_size + - phyp_dump_info->hpte_region_size + - variable_reserve_size; - base = memblock_end_of_DRAM() - size; - memblock_reserve(base, size); - phyp_dump_info->init_reserve_start = base; - phyp_dump_info->init_reserve_size = size; - } -} -#else -static inline void __init phyp_dump_reserve_mem(void) {} -#endif /* CONFIG_PHYP_DUMP && CONFIG_PPC_RTAS */ - void __init early_init_devtree(void *params) { phys_addr_t limit; @@ -715,11 +634,6 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_opal, NULL); #endif -#ifdef CONFIG_PHYP_DUMP - /* scan tree to see if dump occurred during last boot */ - of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL); -#endif - #ifdef CONFIG_FA_DUMP /* scan tree to see if dump is active during last boot */ of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); @@ -765,7 +679,6 @@ void __init early_init_devtree(void *params) #endif reserve_crashkernel(); early_reserve_mem(); - phyp_dump_reserve_mem(); /* * Ensure that total memory size is page-aligned, because otherwise -- cgit v1.2.2 From f2699491e06584a2ebb0939f108ad29f3b151456 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 20 Feb 2012 17:02:09 +0000 Subject: powerpc/perf: Move perf core & PMU code into a subdirectory The perf code has grown a lot since it started, and is big enough to warrant its own subdirectory. For reference it's ~60% bigger than the oprofile code. It declutters the kernel directory, makes it simpler to grep for "just perf stuff", and allows us to shorten some filenames. While we're at it, make it more obvious that we have two implementations of the core perf logic. One for (roughly) Book3S CPUs, which was the original implementation, and the other for Freescale embedded CPUs. 
Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/Makefile | 9 - arch/powerpc/kernel/e500-pmu.c | 134 --- arch/powerpc/kernel/mpc7450-pmu.c | 422 --------- arch/powerpc/kernel/perf_callchain.c | 492 ---------- arch/powerpc/kernel/perf_event.c | 1438 ------------------------------ arch/powerpc/kernel/perf_event_fsl_emb.c | 688 -------------- arch/powerpc/kernel/power4-pmu.c | 621 ------------- arch/powerpc/kernel/power5+-pmu.c | 690 -------------- arch/powerpc/kernel/power5-pmu.c | 629 ------------- arch/powerpc/kernel/power6-pmu.c | 552 ------------ arch/powerpc/kernel/power7-pmu.c | 379 -------- arch/powerpc/kernel/ppc970-pmu.c | 502 ----------- 12 files changed, 6556 deletions(-) delete mode 100644 arch/powerpc/kernel/e500-pmu.c delete mode 100644 arch/powerpc/kernel/mpc7450-pmu.c delete mode 100644 arch/powerpc/kernel/perf_callchain.c delete mode 100644 arch/powerpc/kernel/perf_event.c delete mode 100644 arch/powerpc/kernel/perf_event_fsl_emb.c delete mode 100644 arch/powerpc/kernel/power4-pmu.c delete mode 100644 arch/powerpc/kernel/power5+-pmu.c delete mode 100644 arch/powerpc/kernel/power5-pmu.c delete mode 100644 arch/powerpc/kernel/power6-pmu.c delete mode 100644 arch/powerpc/kernel/power7-pmu.c delete mode 100644 arch/powerpc/kernel/ppc970-pmu.c (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 391bf7e1ba2f..f5808a35688c 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -114,15 +114,6 @@ obj-$(CONFIG_PPC_IO_WORKAROUNDS) += io-workarounds.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o -obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o - -obj-$(CONFIG_PPC_PERF_CTRS) += perf_event.o -obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ - power5+-pmu.o power6-pmu.o power7-pmu.o -obj32-$(CONFIG_PPC_PERF_CTRS) += 
mpc7450-pmu.o - -obj-$(CONFIG_FSL_EMB_PERF_EVENT) += perf_event_fsl_emb.o -obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/e500-pmu.c b/arch/powerpc/kernel/e500-pmu.c deleted file mode 100644 index cb2e2949c8d1..000000000000 --- a/arch/powerpc/kernel/e500-pmu.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Performance counter support for e500 family processors. - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * Copyright 2010 Freescale Semiconductor, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include - -/* - * Map of generic hardware event types to hardware events - * Zero if unsupported - */ -static int e500_generic_events[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 1, - [PERF_COUNT_HW_INSTRUCTIONS] = 2, - [PERF_COUNT_HW_CACHE_MISSES] = 41, /* Data L1 cache reloads */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 12, - [PERF_COUNT_HW_BRANCH_MISSES] = 15, -}; - -#define C(x) PERF_COUNT_HW_CACHE_##x - -/* - * Table of generalized cache-related events. - * 0 means not supported, -1 means nonsensical, other values - * are event codes. - */ -static int e500_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { - /* - * D-cache misses are not split into read/write/prefetch; - * use raw event 41. - */ - [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 27, 0 }, - [C(OP_WRITE)] = { 28, 0 }, - [C(OP_PREFETCH)] = { 29, 0 }, - }, - [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 2, 60 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { 0, 0 }, - }, - /* - * Assuming LL means L2, it's not a good match for this model. 
- * It allocates only on L1 castout or explicit prefetch, and - * does not have separate read/write events (but it does have - * separate instruction/data events). - */ - [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0 }, - [C(OP_WRITE)] = { 0, 0 }, - [C(OP_PREFETCH)] = { 0, 0 }, - }, - /* - * There are data/instruction MMU misses, but that's a miss on - * the chip's internal level-one TLB which is probably not - * what the user wants. Instead, unified level-two TLB misses - * are reported here. - */ - [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 26, 66 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 12, 15 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { -1, -1 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, -}; - -static int num_events = 128; - -/* Upper half of event id is PMLCb, for threshold events */ -static u64 e500_xlate_event(u64 event_id) -{ - u32 event_low = (u32)event_id; - u64 ret; - - if (event_low >= num_events) - return 0; - - ret = FSL_EMB_EVENT_VALID; - - if (event_low >= 76 && event_low <= 81) { - ret |= FSL_EMB_EVENT_RESTRICTED; - ret |= event_id & - (FSL_EMB_EVENT_THRESHMUL | FSL_EMB_EVENT_THRESH); - } else if (event_id & - (FSL_EMB_EVENT_THRESHMUL | FSL_EMB_EVENT_THRESH)) { - /* Threshold requested on non-threshold event */ - return 0; - } - - return ret; -} - -static struct fsl_emb_pmu e500_pmu = { - .name = "e500 family", - .n_counter = 4, - .n_restricted = 2, - .xlate_event = e500_xlate_event, - .n_generic = ARRAY_SIZE(e500_generic_events), - .generic_events = e500_generic_events, - .cache_events = &e500_cache_events, -}; - -static int init_e500_pmu(void) -{ - if (!cur_cpu_spec->oprofile_cpu_type) - return -ENODEV; - - if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/e500mc")) - num_events = 256; - else 
if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/e500")) - return -ENODEV; - - return register_fsl_emb_pmu(&e500_pmu); -} - -early_initcall(init_e500_pmu); diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c deleted file mode 100644 index fe21b515ca44..000000000000 --- a/arch/powerpc/kernel/mpc7450-pmu.c +++ /dev/null @@ -1,422 +0,0 @@ -/* - * Performance counter support for MPC7450-family processors. - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include - -#define N_COUNTER 6 /* Number of hardware counters */ -#define MAX_ALT 3 /* Maximum number of event alternative codes */ - -/* - * Bits in event code for MPC7450 family - */ -#define PM_THRMULT_MSKS 0x40000 -#define PM_THRESH_SH 12 -#define PM_THRESH_MSK 0x3f -#define PM_PMC_SH 8 -#define PM_PMC_MSK 7 -#define PM_PMCSEL_MSK 0x7f - -/* - * Classify events according to how specific their PMC requirements are. - * Result is: - * 0: can go on any PMC - * 1: can go on PMCs 1-4 - * 2: can go on PMCs 1,2,4 - * 3: can go on PMCs 1 or 2 - * 4: can only go on one PMC - * -1: event code is invalid - */ -#define N_CLASSES 5 - -static int mpc7450_classify_event(u32 event) -{ - int pmc; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc > N_COUNTER) - return -1; - return 4; - } - event &= PM_PMCSEL_MSK; - if (event <= 1) - return 0; - if (event <= 7) - return 1; - if (event <= 13) - return 2; - if (event <= 22) - return 3; - return -1; -} - -/* - * Events using threshold and possible threshold scale: - * code scale? 
name - * 11e N PM_INSTQ_EXCEED_CYC - * 11f N PM_ALTV_IQ_EXCEED_CYC - * 128 Y PM_DTLB_SEARCH_EXCEED_CYC - * 12b Y PM_LD_MISS_EXCEED_L1_CYC - * 220 N PM_CQ_EXCEED_CYC - * 30c N PM_GPR_RB_EXCEED_CYC - * 30d ? PM_FPR_IQ_EXCEED_CYC ? - * 311 Y PM_ITLB_SEARCH_EXCEED - * 410 N PM_GPR_IQ_EXCEED_CYC - */ - -/* - * Return use of threshold and threshold scale bits: - * 0 = uses neither, 1 = uses threshold, 2 = uses both - */ -static int mpc7450_threshold_use(u32 event) -{ - int pmc, sel; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - sel = event & PM_PMCSEL_MSK; - switch (pmc) { - case 1: - if (sel == 0x1e || sel == 0x1f) - return 1; - if (sel == 0x28 || sel == 0x2b) - return 2; - break; - case 2: - if (sel == 0x20) - return 1; - break; - case 3: - if (sel == 0xc || sel == 0xd) - return 1; - if (sel == 0x11) - return 2; - break; - case 4: - if (sel == 0x10) - return 1; - break; - } - return 0; -} - -/* - * Layout of constraint bits: - * 33222222222211111111110000000000 - * 10987654321098765432109876543210 - * |< >< > < > < ><><><><><><> - * TS TV G4 G3 G2P6P5P4P3P2P1 - * - * P1 - P6 - * 0 - 11: Count of events needing PMC1 .. 
PMC6 - * - * G2 - * 12 - 14: Count of events needing PMC1 or PMC2 - * - * G3 - * 16 - 18: Count of events needing PMC1, PMC2 or PMC4 - * - * G4 - * 20 - 23: Count of events needing PMC1, PMC2, PMC3 or PMC4 - * - * TV - * 24 - 29: Threshold value requested - * - * TS - * 30: Threshold scale value requested - */ - -static u32 pmcbits[N_COUNTER][2] = { - { 0x00844002, 0x00111001 }, /* PMC1 mask, value: P1,G2,G3,G4 */ - { 0x00844008, 0x00111004 }, /* PMC2: P2,G2,G3,G4 */ - { 0x00800020, 0x00100010 }, /* PMC3: P3,G4 */ - { 0x00840080, 0x00110040 }, /* PMC4: P4,G3,G4 */ - { 0x00000200, 0x00000100 }, /* PMC5: P5 */ - { 0x00000800, 0x00000400 } /* PMC6: P6 */ -}; - -static u32 classbits[N_CLASSES - 1][2] = { - { 0x00000000, 0x00000000 }, /* class 0: no constraint */ - { 0x00800000, 0x00100000 }, /* class 1: G4 */ - { 0x00040000, 0x00010000 }, /* class 2: G3 */ - { 0x00004000, 0x00001000 }, /* class 3: G2 */ -}; - -static int mpc7450_get_constraint(u64 event, unsigned long *maskp, - unsigned long *valp) -{ - int pmc, class; - u32 mask, value; - int thresh, tuse; - - class = mpc7450_classify_event(event); - if (class < 0) - return -1; - if (class == 4) { - pmc = ((unsigned int)event >> PM_PMC_SH) & PM_PMC_MSK; - mask = pmcbits[pmc - 1][0]; - value = pmcbits[pmc - 1][1]; - } else { - mask = classbits[class][0]; - value = classbits[class][1]; - } - - tuse = mpc7450_threshold_use(event); - if (tuse) { - thresh = ((unsigned int)event >> PM_THRESH_SH) & PM_THRESH_MSK; - mask |= 0x3f << 24; - value |= thresh << 24; - if (tuse == 2) { - mask |= 0x40000000; - if ((unsigned int)event & PM_THRMULT_MSKS) - value |= 0x40000000; - } - } - - *maskp = mask; - *valp = value; - return 0; -} - -static const unsigned int event_alternatives[][MAX_ALT] = { - { 0x217, 0x317 }, /* PM_L1_DCACHE_MISS */ - { 0x418, 0x50f, 0x60f }, /* PM_SNOOP_RETRY */ - { 0x502, 0x602 }, /* PM_L2_HIT */ - { 0x503, 0x603 }, /* PM_L3_HIT */ - { 0x504, 0x604 }, /* PM_L2_ICACHE_MISS */ - { 0x505, 0x605 }, /* 
PM_L3_ICACHE_MISS */ - { 0x506, 0x606 }, /* PM_L2_DCACHE_MISS */ - { 0x507, 0x607 }, /* PM_L3_DCACHE_MISS */ - { 0x50a, 0x623 }, /* PM_LD_HIT_L3 */ - { 0x50b, 0x624 }, /* PM_ST_HIT_L3 */ - { 0x50d, 0x60d }, /* PM_L2_TOUCH_HIT */ - { 0x50e, 0x60e }, /* PM_L3_TOUCH_HIT */ - { 0x512, 0x612 }, /* PM_INT_LOCAL */ - { 0x513, 0x61d }, /* PM_L2_MISS */ - { 0x514, 0x61e }, /* PM_L3_MISS */ -}; - -/* - * Scan the alternatives table for a match and return the - * index into the alternatives table if found, else -1. - */ -static int find_alternative(u32 event) -{ - int i, j; - - for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { - if (event < event_alternatives[i][0]) - break; - for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) - if (event == event_alternatives[i][j]) - return i; - } - return -1; -} - -static int mpc7450_get_alternatives(u64 event, unsigned int flags, u64 alt[]) -{ - int i, j, nalt = 1; - u32 ae; - - alt[0] = event; - nalt = 1; - i = find_alternative((u32)event); - if (i >= 0) { - for (j = 0; j < MAX_ALT; ++j) { - ae = event_alternatives[i][j]; - if (ae && ae != (u32)event) - alt[nalt++] = ae; - } - } - return nalt; -} - -/* - * Bitmaps of which PMCs each class can use for classes 0 - 3. - * Bit i is set if PMC i+1 is usable. - */ -static const u8 classmap[N_CLASSES] = { - 0x3f, 0x0f, 0x0b, 0x03, 0 -}; - -/* Bit position and width of each PMCSEL field */ -static const int pmcsel_shift[N_COUNTER] = { - 6, 0, 27, 22, 17, 11 -}; -static const u32 pmcsel_mask[N_COUNTER] = { - 0x7f, 0x3f, 0x1f, 0x1f, 0x1f, 0x3f -}; - -/* - * Compute MMCR0/1/2 values for a set of events. 
- */ -static int mpc7450_compute_mmcr(u64 event[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]) -{ - u8 event_index[N_CLASSES][N_COUNTER]; - int n_classevent[N_CLASSES]; - int i, j, class, tuse; - u32 pmc_inuse = 0, pmc_avail; - u32 mmcr0 = 0, mmcr1 = 0, mmcr2 = 0; - u32 ev, pmc, thresh; - - if (n_ev > N_COUNTER) - return -1; - - /* First pass: count usage in each class */ - for (i = 0; i < N_CLASSES; ++i) - n_classevent[i] = 0; - for (i = 0; i < n_ev; ++i) { - class = mpc7450_classify_event(event[i]); - if (class < 0) - return -1; - j = n_classevent[class]++; - event_index[class][j] = i; - } - - /* Second pass: allocate PMCs from most specific event to least */ - for (class = N_CLASSES - 1; class >= 0; --class) { - for (i = 0; i < n_classevent[class]; ++i) { - ev = event[event_index[class][i]]; - if (class == 4) { - pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc_inuse & (1 << (pmc - 1))) - return -1; - } else { - /* Find a suitable PMC */ - pmc_avail = classmap[class] & ~pmc_inuse; - if (!pmc_avail) - return -1; - pmc = ffs(pmc_avail); - } - pmc_inuse |= 1 << (pmc - 1); - - tuse = mpc7450_threshold_use(ev); - if (tuse) { - thresh = (ev >> PM_THRESH_SH) & PM_THRESH_MSK; - mmcr0 |= thresh << 16; - if (tuse == 2 && (ev & PM_THRMULT_MSKS)) - mmcr2 = 0x80000000; - } - ev &= pmcsel_mask[pmc - 1]; - ev <<= pmcsel_shift[pmc - 1]; - if (pmc <= 2) - mmcr0 |= ev; - else - mmcr1 |= ev; - hwc[event_index[class][i]] = pmc - 1; - } - } - - if (pmc_inuse & 1) - mmcr0 |= MMCR0_PMC1CE; - if (pmc_inuse & 0x3e) - mmcr0 |= MMCR0_PMCnCE; - - /* Return MMCRx values */ - mmcr[0] = mmcr0; - mmcr[1] = mmcr1; - mmcr[2] = mmcr2; - return 0; -} - -/* - * Disable counting by a PMC. - * Note that the pmc argument is 0-based here, not 1-based. 
- */ -static void mpc7450_disable_pmc(unsigned int pmc, unsigned long mmcr[]) -{ - if (pmc <= 1) - mmcr[0] &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]); - else - mmcr[1] &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]); -} - -static int mpc7450_generic_events[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 1, - [PERF_COUNT_HW_INSTRUCTIONS] = 2, - [PERF_COUNT_HW_CACHE_MISSES] = 0x217, /* PM_L1_DCACHE_MISS */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x122, /* PM_BR_CMPL */ - [PERF_COUNT_HW_BRANCH_MISSES] = 0x41c, /* PM_BR_MPRED */ -}; - -#define C(x) PERF_COUNT_HW_CACHE_##x - -/* - * Table of generalized cache-related events. - * 0 means not supported, -1 means nonsensical, other values - * are event codes. - */ -static int mpc7450_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { - [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x225 }, - [C(OP_WRITE)] = { 0, 0x227 }, - [C(OP_PREFETCH)] = { 0, 0 }, - }, - [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x129, 0x115 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { 0x634, 0 }, - }, - [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0 }, - [C(OP_WRITE)] = { 0, 0 }, - [C(OP_PREFETCH)] = { 0, 0 }, - }, - [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x312 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x223 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x122, 0x41c }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { -1, -1 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, -}; - -struct power_pmu mpc7450_pmu = { - .name = "MPC7450 family", - .n_counter = N_COUNTER, - .max_alternatives = MAX_ALT, - .add_fields = 0x00111555ul, - .test_adder = 0x00301000ul, - .compute_mmcr = 
mpc7450_compute_mmcr, - .get_constraint = mpc7450_get_constraint, - .get_alternatives = mpc7450_get_alternatives, - .disable_pmc = mpc7450_disable_pmc, - .n_generic = ARRAY_SIZE(mpc7450_generic_events), - .generic_events = mpc7450_generic_events, - .cache_events = &mpc7450_cache_events, -}; - -static int __init init_mpc7450_pmu(void) -{ - if (!cur_cpu_spec->oprofile_cpu_type || - strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/7450")) - return -ENODEV; - - return register_power_pmu(&mpc7450_pmu); -} - -early_initcall(init_mpc7450_pmu); diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c deleted file mode 100644 index 564c1d8bdb5c..000000000000 --- a/arch/powerpc/kernel/perf_callchain.c +++ /dev/null @@ -1,492 +0,0 @@ -/* - * Performance counter callchain support - powerpc architecture code - * - * Copyright © 2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef CONFIG_PPC64 -#include "ppc32.h" -#endif - - -/* - * Is sp valid as the address of the next kernel stack frame after prev_sp? - * The next frame may be in a different stack area but should not go - * back down in the same stack area. - */ -static int valid_next_sp(unsigned long sp, unsigned long prev_sp) -{ - if (sp & 0xf) - return 0; /* must be 16-byte aligned */ - if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) - return 0; - if (sp >= prev_sp + STACK_FRAME_OVERHEAD) - return 1; - /* - * sp could decrease when we jump off an interrupt stack - * back to the regular process stack. 
- */ - if ((sp & ~(THREAD_SIZE - 1)) != (prev_sp & ~(THREAD_SIZE - 1))) - return 1; - return 0; -} - -void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) -{ - unsigned long sp, next_sp; - unsigned long next_ip; - unsigned long lr; - long level = 0; - unsigned long *fp; - - lr = regs->link; - sp = regs->gpr[1]; - perf_callchain_store(entry, regs->nip); - - if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) - return; - - for (;;) { - fp = (unsigned long *) sp; - next_sp = fp[0]; - - if (next_sp == sp + STACK_INT_FRAME_SIZE && - fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { - /* - * This looks like an interrupt frame for an - * interrupt that occurred in the kernel - */ - regs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD); - next_ip = regs->nip; - lr = regs->link; - level = 0; - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - - } else { - if (level == 0) - next_ip = lr; - else - next_ip = fp[STACK_FRAME_LR_SAVE]; - - /* - * We can't tell which of the first two addresses - * we get are valid, but we can filter out the - * obviously bogus ones here. We replace them - * with 0 rather than removing them entirely so - * that userspace can tell which is which. - */ - if ((level == 1 && next_ip == lr) || - (level <= 1 && !kernel_text_address(next_ip))) - next_ip = 0; - - ++level; - } - - perf_callchain_store(entry, next_ip); - if (!valid_next_sp(next_sp, sp)) - return; - sp = next_sp; - } -} - -#ifdef CONFIG_PPC64 -/* - * On 64-bit we don't want to invoke hash_page on user addresses from - * interrupt context, so if the access faults, we read the page tables - * to find which page (if any) is mapped and access it directly. 
- */ -static int read_user_stack_slow(void __user *ptr, void *ret, int nb) -{ - pgd_t *pgdir; - pte_t *ptep, pte; - unsigned shift; - unsigned long addr = (unsigned long) ptr; - unsigned long offset; - unsigned long pfn; - void *kaddr; - - pgdir = current->mm->pgd; - if (!pgdir) - return -EFAULT; - - ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift); - if (!shift) - shift = PAGE_SHIFT; - - /* align address to page boundary */ - offset = addr & ((1UL << shift) - 1); - addr -= offset; - - if (ptep == NULL) - return -EFAULT; - pte = *ptep; - if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER)) - return -EFAULT; - pfn = pte_pfn(pte); - if (!page_is_ram(pfn)) - return -EFAULT; - - /* no highmem to worry about here */ - kaddr = pfn_to_kaddr(pfn); - memcpy(ret, kaddr + offset, nb); - return 0; -} - -static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) -{ - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) || - ((unsigned long)ptr & 7)) - return -EFAULT; - - pagefault_disable(); - if (!__get_user_inatomic(*ret, ptr)) { - pagefault_enable(); - return 0; - } - pagefault_enable(); - - return read_user_stack_slow(ptr, ret, 8); -} - -static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) -{ - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || - ((unsigned long)ptr & 3)) - return -EFAULT; - - pagefault_disable(); - if (!__get_user_inatomic(*ret, ptr)) { - pagefault_enable(); - return 0; - } - pagefault_enable(); - - return read_user_stack_slow(ptr, ret, 4); -} - -static inline int valid_user_sp(unsigned long sp, int is_64) -{ - if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32) - return 0; - return 1; -} - -/* - * 64-bit user processes use the same stack frame for RT and non-RT signals. 
- */ -struct signal_frame_64 { - char dummy[__SIGNAL_FRAMESIZE]; - struct ucontext uc; - unsigned long unused[2]; - unsigned int tramp[6]; - struct siginfo *pinfo; - void *puc; - struct siginfo info; - char abigap[288]; -}; - -static int is_sigreturn_64_address(unsigned long nip, unsigned long fp) -{ - if (nip == fp + offsetof(struct signal_frame_64, tramp)) - return 1; - if (vdso64_rt_sigtramp && current->mm->context.vdso_base && - nip == current->mm->context.vdso_base + vdso64_rt_sigtramp) - return 1; - return 0; -} - -/* - * Do some sanity checking on the signal frame pointed to by sp. - * We check the pinfo and puc pointers in the frame. - */ -static int sane_signal_64_frame(unsigned long sp) -{ - struct signal_frame_64 __user *sf; - unsigned long pinfo, puc; - - sf = (struct signal_frame_64 __user *) sp; - if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) || - read_user_stack_64((unsigned long __user *) &sf->puc, &puc)) - return 0; - return pinfo == (unsigned long) &sf->info && - puc == (unsigned long) &sf->uc; -} - -static void perf_callchain_user_64(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ - unsigned long sp, next_sp; - unsigned long next_ip; - unsigned long lr; - long level = 0; - struct signal_frame_64 __user *sigframe; - unsigned long __user *fp, *uregs; - - next_ip = regs->nip; - lr = regs->link; - sp = regs->gpr[1]; - perf_callchain_store(entry, next_ip); - - for (;;) { - fp = (unsigned long __user *) sp; - if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) - return; - if (level > 0 && read_user_stack_64(&fp[2], &next_ip)) - return; - - /* - * Note: the next_sp - sp >= signal frame size check - * is true when next_sp < sp, which can happen when - * transitioning from an alternate signal stack to the - * normal stack. 
- */ - if (next_sp - sp >= sizeof(struct signal_frame_64) && - (is_sigreturn_64_address(next_ip, sp) || - (level <= 1 && is_sigreturn_64_address(lr, sp))) && - sane_signal_64_frame(sp)) { - /* - * This looks like an signal frame - */ - sigframe = (struct signal_frame_64 __user *) sp; - uregs = sigframe->uc.uc_mcontext.gp_regs; - if (read_user_stack_64(&uregs[PT_NIP], &next_ip) || - read_user_stack_64(&uregs[PT_LNK], &lr) || - read_user_stack_64(&uregs[PT_R1], &sp)) - return; - level = 0; - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_store(entry, next_ip); - continue; - } - - if (level == 0) - next_ip = lr; - perf_callchain_store(entry, next_ip); - ++level; - sp = next_sp; - } -} - -static inline int current_is_64bit(void) -{ - /* - * We can't use test_thread_flag() here because we may be on an - * interrupt stack, and the thread flags don't get copied over - * from the thread_info on the main stack to the interrupt stack. - */ - return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT); -} - -#else /* CONFIG_PPC64 */ -/* - * On 32-bit we just access the address and let hash_page create a - * HPTE if necessary, so there is no need to fall back to reading - * the page tables. Since this is called at interrupt level, - * do_page_fault() won't treat a DSI as a page fault. 
- */ -static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) -{ - int rc; - - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || - ((unsigned long)ptr & 3)) - return -EFAULT; - - pagefault_disable(); - rc = __get_user_inatomic(*ret, ptr); - pagefault_enable(); - - return rc; -} - -static inline void perf_callchain_user_64(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static inline int current_is_64bit(void) -{ - return 0; -} - -static inline int valid_user_sp(unsigned long sp, int is_64) -{ - if (!sp || (sp & 7) || sp > TASK_SIZE - 32) - return 0; - return 1; -} - -#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE -#define sigcontext32 sigcontext -#define mcontext32 mcontext -#define ucontext32 ucontext -#define compat_siginfo_t struct siginfo - -#endif /* CONFIG_PPC64 */ - -/* - * Layout for non-RT signal frames - */ -struct signal_frame_32 { - char dummy[__SIGNAL_FRAMESIZE32]; - struct sigcontext32 sctx; - struct mcontext32 mctx; - int abigap[56]; -}; - -/* - * Layout for RT signal frames - */ -struct rt_signal_frame_32 { - char dummy[__SIGNAL_FRAMESIZE32 + 16]; - compat_siginfo_t info; - struct ucontext32 uc; - int abigap[56]; -}; - -static int is_sigreturn_32_address(unsigned int nip, unsigned int fp) -{ - if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad)) - return 1; - if (vdso32_sigtramp && current->mm->context.vdso_base && - nip == current->mm->context.vdso_base + vdso32_sigtramp) - return 1; - return 0; -} - -static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp) -{ - if (nip == fp + offsetof(struct rt_signal_frame_32, - uc.uc_mcontext.mc_pad)) - return 1; - if (vdso32_rt_sigtramp && current->mm->context.vdso_base && - nip == current->mm->context.vdso_base + vdso32_rt_sigtramp) - return 1; - return 0; -} - -static int sane_signal_32_frame(unsigned int sp) -{ - struct signal_frame_32 __user *sf; - unsigned int regs; - - sf = (struct signal_frame_32 __user *) (unsigned long) sp; 
- if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs)) - return 0; - return regs == (unsigned long) &sf->mctx; -} - -static int sane_rt_signal_32_frame(unsigned int sp) -{ - struct rt_signal_frame_32 __user *sf; - unsigned int regs; - - sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp; - if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs)) - return 0; - return regs == (unsigned long) &sf->uc.uc_mcontext; -} - -static unsigned int __user *signal_frame_32_regs(unsigned int sp, - unsigned int next_sp, unsigned int next_ip) -{ - struct mcontext32 __user *mctx = NULL; - struct signal_frame_32 __user *sf; - struct rt_signal_frame_32 __user *rt_sf; - - /* - * Note: the next_sp - sp >= signal frame size check - * is true when next_sp < sp, for example, when - * transitioning from an alternate signal stack to the - * normal stack. - */ - if (next_sp - sp >= sizeof(struct signal_frame_32) && - is_sigreturn_32_address(next_ip, sp) && - sane_signal_32_frame(sp)) { - sf = (struct signal_frame_32 __user *) (unsigned long) sp; - mctx = &sf->mctx; - } - - if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) && - is_rt_sigreturn_32_address(next_ip, sp) && - sane_rt_signal_32_frame(sp)) { - rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp; - mctx = &rt_sf->uc.uc_mcontext; - } - - if (!mctx) - return NULL; - return mctx->mc_gregs; -} - -static void perf_callchain_user_32(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ - unsigned int sp, next_sp; - unsigned int next_ip; - unsigned int lr; - long level = 0; - unsigned int __user *fp, *uregs; - - next_ip = regs->nip; - lr = regs->link; - sp = regs->gpr[1]; - perf_callchain_store(entry, next_ip); - - while (entry->nr < PERF_MAX_STACK_DEPTH) { - fp = (unsigned int __user *) (unsigned long) sp; - if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) - return; - if (level > 0 && read_user_stack_32(&fp[1], &next_ip)) - return; - - uregs =
signal_frame_32_regs(sp, next_sp, next_ip); - if (!uregs && level <= 1) - uregs = signal_frame_32_regs(sp, next_sp, lr); - if (uregs) { - /* - * This looks like an signal frame, so restart - * the stack trace with the values in it. - */ - if (read_user_stack_32(&uregs[PT_NIP], &next_ip) || - read_user_stack_32(&uregs[PT_LNK], &lr) || - read_user_stack_32(&uregs[PT_R1], &sp)) - return; - level = 0; - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_store(entry, next_ip); - continue; - } - - if (level == 0) - next_ip = lr; - perf_callchain_store(entry, next_ip); - ++level; - sp = next_sp; - } -} - -void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) -{ - if (current_is_64bit()) - perf_callchain_user_64(entry, regs); - else - perf_callchain_user_32(entry, regs); -} diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c deleted file mode 100644 index 64483fde95c6..000000000000 --- a/arch/powerpc/kernel/perf_event.c +++ /dev/null @@ -1,1438 +0,0 @@ -/* - * Performance event support - powerpc architecture code - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct cpu_hw_events { - int n_events; - int n_percpu; - int disabled; - int n_added; - int n_limited; - u8 pmcs_enabled; - struct perf_event *event[MAX_HWEVENTS]; - u64 events[MAX_HWEVENTS]; - unsigned int flags[MAX_HWEVENTS]; - unsigned long mmcr[3]; - struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS]; - u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS]; - u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; - unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; - unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; - - unsigned int group_flag; - int n_txn_start; -}; -DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); - -struct power_pmu *ppmu; - -/* - * Normally, to ignore kernel events we set the FCS (freeze counters - * in supervisor mode) bit in MMCR0, but if the kernel runs with the - * hypervisor bit set in the MSR, or if we are running on a processor - * where the hypervisor bit is forced to 1 (as on Apple G5 processors), - * then we need to use the FCHV bit to ignore kernel events. - */ -static unsigned int freeze_events_kernel = MMCR0_FCS; - -/* - * 32-bit doesn't have MMCRA but does have an MMCR2, - * and a few other names are different. - */ -#ifdef CONFIG_PPC32 - -#define MMCR0_FCHV 0 -#define MMCR0_PMCjCE MMCR0_PMCnCE - -#define SPRN_MMCRA SPRN_MMCR2 -#define MMCRA_SAMPLE_ENABLE 0 - -static inline unsigned long perf_ip_adjust(struct pt_regs *regs) -{ - return 0; -} -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { } -static inline u32 perf_get_misc_flags(struct pt_regs *regs) -{ - return 0; -} -static inline void perf_read_regs(struct pt_regs *regs) { } -static inline int perf_intr_is_nmi(struct pt_regs *regs) -{ - return 0; -} - -#endif /* CONFIG_PPC32 */ - -/* - * Things that are specific to 64-bit implementations. 
- */ -#ifdef CONFIG_PPC64 - -static inline unsigned long perf_ip_adjust(struct pt_regs *regs) -{ - unsigned long mmcra = regs->dsisr; - - if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { - unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; - if (slot > 1) - return 4 * (slot - 1); - } - return 0; -} - -/* - * The user wants a data address recorded. - * If we're not doing instruction sampling, give them the SDAR - * (sampled data address). If we are doing instruction sampling, then - * only give them the SDAR if it corresponds to the instruction - * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC - * bit in MMCRA. - */ -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) -{ - unsigned long mmcra = regs->dsisr; - unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? - POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; - - if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) - *addrp = mfspr(SPRN_SDAR); -} - -static inline u32 perf_get_misc_flags(struct pt_regs *regs) -{ - unsigned long mmcra = regs->dsisr; - unsigned long sihv = MMCRA_SIHV; - unsigned long sipr = MMCRA_SIPR; - - if (TRAP(regs) != 0xf00) - return 0; /* not a PMU interrupt */ - - if (ppmu->flags & PPMU_ALT_SIPR) { - sihv = POWER6_MMCRA_SIHV; - sipr = POWER6_MMCRA_SIPR; - } - - /* PR has priority over HV, so order below is important */ - if (mmcra & sipr) - return PERF_RECORD_MISC_USER; - if ((mmcra & sihv) && (freeze_events_kernel != MMCR0_FCHV)) - return PERF_RECORD_MISC_HYPERVISOR; - return PERF_RECORD_MISC_KERNEL; -} - -/* - * Overload regs->dsisr to store MMCRA so we only need to read it once - * on each interrupt. - */ -static inline void perf_read_regs(struct pt_regs *regs) -{ - regs->dsisr = mfspr(SPRN_MMCRA); -} - -/* - * If interrupts were soft-disabled when a PMU interrupt occurs, treat - * it as an NMI. 
- */ -static inline int perf_intr_is_nmi(struct pt_regs *regs) -{ - return !regs->softe; -} - -#endif /* CONFIG_PPC64 */ - -static void perf_event_interrupt(struct pt_regs *regs); - -void perf_event_print_debug(void) -{ -} - -/* - * Read one performance monitor counter (PMC). - */ -static unsigned long read_pmc(int idx) -{ - unsigned long val; - - switch (idx) { - case 1: - val = mfspr(SPRN_PMC1); - break; - case 2: - val = mfspr(SPRN_PMC2); - break; - case 3: - val = mfspr(SPRN_PMC3); - break; - case 4: - val = mfspr(SPRN_PMC4); - break; - case 5: - val = mfspr(SPRN_PMC5); - break; - case 6: - val = mfspr(SPRN_PMC6); - break; -#ifdef CONFIG_PPC64 - case 7: - val = mfspr(SPRN_PMC7); - break; - case 8: - val = mfspr(SPRN_PMC8); - break; -#endif /* CONFIG_PPC64 */ - default: - printk(KERN_ERR "oops trying to read PMC%d\n", idx); - val = 0; - } - return val; -} - -/* - * Write one PMC. - */ -static void write_pmc(int idx, unsigned long val) -{ - switch (idx) { - case 1: - mtspr(SPRN_PMC1, val); - break; - case 2: - mtspr(SPRN_PMC2, val); - break; - case 3: - mtspr(SPRN_PMC3, val); - break; - case 4: - mtspr(SPRN_PMC4, val); - break; - case 5: - mtspr(SPRN_PMC5, val); - break; - case 6: - mtspr(SPRN_PMC6, val); - break; -#ifdef CONFIG_PPC64 - case 7: - mtspr(SPRN_PMC7, val); - break; - case 8: - mtspr(SPRN_PMC8, val); - break; -#endif /* CONFIG_PPC64 */ - default: - printk(KERN_ERR "oops trying to write PMC%d\n", idx); - } -} - -/* - * Check if a set of events can all go on the PMU at once. - * If they can't, this will look at alternative codes for the events - * and see if any combination of alternative codes is feasible. - * The feasible set is returned in event_id[]. 
- */ -static int power_check_constraints(struct cpu_hw_events *cpuhw, - u64 event_id[], unsigned int cflags[], - int n_ev) -{ - unsigned long mask, value, nv; - unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS]; - int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS]; - int i, j; - unsigned long addf = ppmu->add_fields; - unsigned long tadd = ppmu->test_adder; - - if (n_ev > ppmu->n_counter) - return -1; - - /* First see if the events will go on as-is */ - for (i = 0; i < n_ev; ++i) { - if ((cflags[i] & PPMU_LIMITED_PMC_REQD) - && !ppmu->limited_pmc_event(event_id[i])) { - ppmu->get_alternatives(event_id[i], cflags[i], - cpuhw->alternatives[i]); - event_id[i] = cpuhw->alternatives[i][0]; - } - if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0], - &cpuhw->avalues[i][0])) - return -1; - } - value = mask = 0; - for (i = 0; i < n_ev; ++i) { - nv = (value | cpuhw->avalues[i][0]) + - (value & cpuhw->avalues[i][0] & addf); - if ((((nv + tadd) ^ value) & mask) != 0 || - (((nv + tadd) ^ cpuhw->avalues[i][0]) & - cpuhw->amasks[i][0]) != 0) - break; - value = nv; - mask |= cpuhw->amasks[i][0]; - } - if (i == n_ev) - return 0; /* all OK */ - - /* doesn't work, gather alternatives... */ - if (!ppmu->get_alternatives) - return -1; - for (i = 0; i < n_ev; ++i) { - choice[i] = 0; - n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i], - cpuhw->alternatives[i]); - for (j = 1; j < n_alt[i]; ++j) - ppmu->get_constraint(cpuhw->alternatives[i][j], - &cpuhw->amasks[i][j], - &cpuhw->avalues[i][j]); - } - - /* enumerate all possibilities and see if any will work */ - i = 0; - j = -1; - value = mask = nv = 0; - while (i < n_ev) { - if (j >= 0) { - /* we're backtracking, restore context */ - value = svalues[i]; - mask = smasks[i]; - j = choice[i]; - } - /* - * See if any alternative k for event_id i, - * where k > j, will satisfy the constraints. 
- */ - while (++j < n_alt[i]) { - nv = (value | cpuhw->avalues[i][j]) + - (value & cpuhw->avalues[i][j] & addf); - if ((((nv + tadd) ^ value) & mask) == 0 && - (((nv + tadd) ^ cpuhw->avalues[i][j]) - & cpuhw->amasks[i][j]) == 0) - break; - } - if (j >= n_alt[i]) { - /* - * No feasible alternative, backtrack - * to event_id i-1 and continue enumerating its - * alternatives from where we got up to. - */ - if (--i < 0) - return -1; - } else { - /* - * Found a feasible alternative for event_id i, - * remember where we got up to with this event_id, - * go on to the next event_id, and start with - * the first alternative for it. - */ - choice[i] = j; - svalues[i] = value; - smasks[i] = mask; - value = nv; - mask |= cpuhw->amasks[i][j]; - ++i; - j = -1; - } - } - - /* OK, we have a feasible combination, tell the caller the solution */ - for (i = 0; i < n_ev; ++i) - event_id[i] = cpuhw->alternatives[i][choice[i]]; - return 0; -} - -/* - * Check if newly-added events have consistent settings for - * exclude_{user,kernel,hv} with each other and any previously - * added events. 
- */ -static int check_excludes(struct perf_event **ctrs, unsigned int cflags[], - int n_prev, int n_new) -{ - int eu = 0, ek = 0, eh = 0; - int i, n, first; - struct perf_event *event; - - n = n_prev + n_new; - if (n <= 1) - return 0; - - first = 1; - for (i = 0; i < n; ++i) { - if (cflags[i] & PPMU_LIMITED_PMC_OK) { - cflags[i] &= ~PPMU_LIMITED_PMC_REQD; - continue; - } - event = ctrs[i]; - if (first) { - eu = event->attr.exclude_user; - ek = event->attr.exclude_kernel; - eh = event->attr.exclude_hv; - first = 0; - } else if (event->attr.exclude_user != eu || - event->attr.exclude_kernel != ek || - event->attr.exclude_hv != eh) { - return -EAGAIN; - } - } - - if (eu || ek || eh) - for (i = 0; i < n; ++i) - if (cflags[i] & PPMU_LIMITED_PMC_OK) - cflags[i] |= PPMU_LIMITED_PMC_REQD; - - return 0; -} - -static u64 check_and_compute_delta(u64 prev, u64 val) -{ - u64 delta = (val - prev) & 0xfffffffful; - - /* - * POWER7 can roll back counter values, if the new value is smaller - * than the previous value it will cause the delta and the counter to - * have bogus values unless we rolled a counter over. If a coutner is - * rolled back, it will be smaller, but within 256, which is the maximum - * number of events to rollback at once. If we dectect a rollback - * return 0. This can lead to a small lack of precision in the - * counters. - */ - if (prev > val && (prev - val) < 256) - delta = 0; - - return delta; -} - -static void power_pmu_read(struct perf_event *event) -{ - s64 val, delta, prev; - - if (event->hw.state & PERF_HES_STOPPED) - return; - - if (!event->hw.idx) - return; - /* - * Performance monitor interrupts come even when interrupts - * are soft-disabled, as long as interrupts are hard-enabled. - * Therefore we treat them like NMIs. 
- */ - do { - prev = local64_read(&event->hw.prev_count); - barrier(); - val = read_pmc(event->hw.idx); - delta = check_and_compute_delta(prev, val); - if (!delta) - return; - } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); - - local64_add(delta, &event->count); - local64_sub(delta, &event->hw.period_left); -} - -/* - * On some machines, PMC5 and PMC6 can't be written, don't respect - * the freeze conditions, and don't generate interrupts. This tells - * us if `event' is using such a PMC. - */ -static int is_limited_pmc(int pmcnum) -{ - return (ppmu->flags & PPMU_LIMITED_PMC5_6) - && (pmcnum == 5 || pmcnum == 6); -} - -static void freeze_limited_counters(struct cpu_hw_events *cpuhw, - unsigned long pmc5, unsigned long pmc6) -{ - struct perf_event *event; - u64 val, prev, delta; - int i; - - for (i = 0; i < cpuhw->n_limited; ++i) { - event = cpuhw->limited_counter[i]; - if (!event->hw.idx) - continue; - val = (event->hw.idx == 5) ? pmc5 : pmc6; - prev = local64_read(&event->hw.prev_count); - event->hw.idx = 0; - delta = check_and_compute_delta(prev, val); - if (delta) - local64_add(delta, &event->count); - } -} - -static void thaw_limited_counters(struct cpu_hw_events *cpuhw, - unsigned long pmc5, unsigned long pmc6) -{ - struct perf_event *event; - u64 val, prev; - int i; - - for (i = 0; i < cpuhw->n_limited; ++i) { - event = cpuhw->limited_counter[i]; - event->hw.idx = cpuhw->limited_hwidx[i]; - val = (event->hw.idx == 5) ? pmc5 : pmc6; - prev = local64_read(&event->hw.prev_count); - if (check_and_compute_delta(prev, val)) - local64_set(&event->hw.prev_count, val); - perf_event_update_userpage(event); - } -} - -/* - * Since limited events don't respect the freeze conditions, we - * have to read them immediately after freezing or unfreezing the - * other events. 
We try to keep the values from the limited - * events as consistent as possible by keeping the delay (in - * cycles and instructions) between freezing/unfreezing and reading - * the limited events as small and consistent as possible. - * Therefore, if any limited events are in use, we read them - * both, and always in the same order, to minimize variability, - * and do it inside the same asm that writes MMCR0. - */ -static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0) -{ - unsigned long pmc5, pmc6; - - if (!cpuhw->n_limited) { - mtspr(SPRN_MMCR0, mmcr0); - return; - } - - /* - * Write MMCR0, then read PMC5 and PMC6 immediately. - * To ensure we don't get a performance monitor interrupt - * between writing MMCR0 and freezing/thawing the limited - * events, we first write MMCR0 with the event overflow - * interrupt enable bits turned off. - */ - asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" - : "=&r" (pmc5), "=&r" (pmc6) - : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)), - "i" (SPRN_MMCR0), - "i" (SPRN_PMC5), "i" (SPRN_PMC6)); - - if (mmcr0 & MMCR0_FC) - freeze_limited_counters(cpuhw, pmc5, pmc6); - else - thaw_limited_counters(cpuhw, pmc5, pmc6); - - /* - * Write the full MMCR0 including the event overflow interrupt - * enable bits, if necessary. - */ - if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE)) - mtspr(SPRN_MMCR0, mmcr0); -} - -/* - * Disable all events to prevent PMU interrupts and to allow - * events to be added or removed. - */ -static void power_pmu_disable(struct pmu *pmu) -{ - struct cpu_hw_events *cpuhw; - unsigned long flags; - - if (!ppmu) - return; - local_irq_save(flags); - cpuhw = &__get_cpu_var(cpu_hw_events); - - if (!cpuhw->disabled) { - cpuhw->disabled = 1; - cpuhw->n_added = 0; - - /* - * Check if we ever enabled the PMU on this cpu. 
- */ - if (!cpuhw->pmcs_enabled) { - ppc_enable_pmcs(); - cpuhw->pmcs_enabled = 1; - } - - /* - * Disable instruction sampling if it was enabled - */ - if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { - mtspr(SPRN_MMCRA, - cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mb(); - } - - /* - * Set the 'freeze counters' bit. - * The barrier is to make sure the mtspr has been - * executed and the PMU has frozen the events - * before we return. - */ - write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); - mb(); - } - local_irq_restore(flags); -} - -/* - * Re-enable all events if disable == 0. - * If we were previously disabled and events were added, then - * put the new config on the PMU. - */ -static void power_pmu_enable(struct pmu *pmu) -{ - struct perf_event *event; - struct cpu_hw_events *cpuhw; - unsigned long flags; - long i; - unsigned long val; - s64 left; - unsigned int hwc_index[MAX_HWEVENTS]; - int n_lim; - int idx; - - if (!ppmu) - return; - local_irq_save(flags); - cpuhw = &__get_cpu_var(cpu_hw_events); - if (!cpuhw->disabled) { - local_irq_restore(flags); - return; - } - cpuhw->disabled = 0; - - /* - * If we didn't change anything, or only removed events, - * no need to recalculate MMCR* settings and reset the PMCs. - * Just reenable the PMU with the current MMCR* settings - * (possibly updated for removal of events). - */ - if (!cpuhw->n_added) { - mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - if (cpuhw->n_events == 0) - ppc_set_pmu_inuse(0); - goto out_enable; - } - - /* - * Compute MMCR* values for the new set of events - */ - if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index, - cpuhw->mmcr)) { - /* shouldn't ever get here */ - printk(KERN_ERR "oops compute_mmcr failed\n"); - goto out; - } - - /* - * Add in MMCR0 freeze bits corresponding to the - * attr.exclude_* bits for the first event. - * We have already checked that all events have the - * same values for these bits as the first event. 
- */ - event = cpuhw->event[0]; - if (event->attr.exclude_user) - cpuhw->mmcr[0] |= MMCR0_FCP; - if (event->attr.exclude_kernel) - cpuhw->mmcr[0] |= freeze_events_kernel; - if (event->attr.exclude_hv) - cpuhw->mmcr[0] |= MMCR0_FCHV; - - /* - * Write the new configuration to MMCR* with the freeze - * bit set and set the hardware events to their initial values. - * Then unfreeze the events. - */ - ppc_set_pmu_inuse(1); - mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) - | MMCR0_FC); - - /* - * Read off any pre-existing events that need to move - * to another PMC. - */ - for (i = 0; i < cpuhw->n_events; ++i) { - event = cpuhw->event[i]; - if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) { - power_pmu_read(event); - write_pmc(event->hw.idx, 0); - event->hw.idx = 0; - } - } - - /* - * Initialize the PMCs for all the new and moved events. - */ - cpuhw->n_limited = n_lim = 0; - for (i = 0; i < cpuhw->n_events; ++i) { - event = cpuhw->event[i]; - if (event->hw.idx) - continue; - idx = hwc_index[i] + 1; - if (is_limited_pmc(idx)) { - cpuhw->limited_counter[n_lim] = event; - cpuhw->limited_hwidx[n_lim] = idx; - ++n_lim; - continue; - } - val = 0; - if (event->hw.sample_period) { - left = local64_read(&event->hw.period_left); - if (left < 0x80000000L) - val = 0x80000000L - left; - } - local64_set(&event->hw.prev_count, val); - event->hw.idx = idx; - if (event->hw.state & PERF_HES_STOPPED) - val = 0; - write_pmc(idx, val); - perf_event_update_userpage(event); - } - cpuhw->n_limited = n_lim; - cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; - - out_enable: - mb(); - write_mmcr0(cpuhw, cpuhw->mmcr[0]); - - /* - * Enable instruction sampling if necessary - */ - if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { - mb(); - mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); - } - - out: - local_irq_restore(flags); -} - -static int collect_events(struct perf_event *group, int max_count, - 
struct perf_event *ctrs[], u64 *events, - unsigned int *flags) -{ - int n = 0; - struct perf_event *event; - - if (!is_software_event(group)) { - if (n >= max_count) - return -1; - ctrs[n] = group; - flags[n] = group->hw.event_base; - events[n++] = group->hw.config; - } - list_for_each_entry(event, &group->sibling_list, group_entry) { - if (!is_software_event(event) && - event->state != PERF_EVENT_STATE_OFF) { - if (n >= max_count) - return -1; - ctrs[n] = event; - flags[n] = event->hw.event_base; - events[n++] = event->hw.config; - } - } - return n; -} - -/* - * Add a event to the PMU. - * If all events are not already frozen, then we disable and - * re-enable the PMU in order to get hw_perf_enable to do the - * actual work of reconfiguring the PMU. - */ -static int power_pmu_add(struct perf_event *event, int ef_flags) -{ - struct cpu_hw_events *cpuhw; - unsigned long flags; - int n0; - int ret = -EAGAIN; - - local_irq_save(flags); - perf_pmu_disable(event->pmu); - - /* - * Add the event to the list (if there is room) - * and check whether the total set is still feasible. 
- */ - cpuhw = &__get_cpu_var(cpu_hw_events); - n0 = cpuhw->n_events; - if (n0 >= ppmu->n_counter) - goto out; - cpuhw->event[n0] = event; - cpuhw->events[n0] = event->hw.config; - cpuhw->flags[n0] = event->hw.event_base; - - if (!(ef_flags & PERF_EF_START)) - event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE; - - /* - * If group events scheduling transaction was started, - * skip the schedulability test here, it will be performed - * at commit time(->commit_txn) as a whole - */ - if (cpuhw->group_flag & PERF_EVENT_TXN) - goto nocheck; - - if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) - goto out; - if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1)) - goto out; - event->hw.config = cpuhw->events[n0]; - -nocheck: - ++cpuhw->n_events; - ++cpuhw->n_added; - - ret = 0; - out: - perf_pmu_enable(event->pmu); - local_irq_restore(flags); - return ret; -} - -/* - * Remove a event from the PMU. - */ -static void power_pmu_del(struct perf_event *event, int ef_flags) -{ - struct cpu_hw_events *cpuhw; - long i; - unsigned long flags; - - local_irq_save(flags); - perf_pmu_disable(event->pmu); - - power_pmu_read(event); - - cpuhw = &__get_cpu_var(cpu_hw_events); - for (i = 0; i < cpuhw->n_events; ++i) { - if (event == cpuhw->event[i]) { - while (++i < cpuhw->n_events) { - cpuhw->event[i-1] = cpuhw->event[i]; - cpuhw->events[i-1] = cpuhw->events[i]; - cpuhw->flags[i-1] = cpuhw->flags[i]; - } - --cpuhw->n_events; - ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr); - if (event->hw.idx) { - write_pmc(event->hw.idx, 0); - event->hw.idx = 0; - } - perf_event_update_userpage(event); - break; - } - } - for (i = 0; i < cpuhw->n_limited; ++i) - if (event == cpuhw->limited_counter[i]) - break; - if (i < cpuhw->n_limited) { - while (++i < cpuhw->n_limited) { - cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i]; - cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; - } - --cpuhw->n_limited; - } - if (cpuhw->n_events == 0) { - /* disable exceptions if 
no events are running */ - cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); - } - - perf_pmu_enable(event->pmu); - local_irq_restore(flags); -} - -/* - * POWER-PMU does not support disabling individual counters, hence - * program their cycle counter to their max value and ignore the interrupts. - */ - -static void power_pmu_start(struct perf_event *event, int ef_flags) -{ - unsigned long flags; - s64 left; - unsigned long val; - - if (!event->hw.idx || !event->hw.sample_period) - return; - - if (!(event->hw.state & PERF_HES_STOPPED)) - return; - - if (ef_flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - - local_irq_save(flags); - perf_pmu_disable(event->pmu); - - event->hw.state = 0; - left = local64_read(&event->hw.period_left); - - val = 0; - if (left < 0x80000000L) - val = 0x80000000L - left; - - write_pmc(event->hw.idx, val); - - perf_event_update_userpage(event); - perf_pmu_enable(event->pmu); - local_irq_restore(flags); -} - -static void power_pmu_stop(struct perf_event *event, int ef_flags) -{ - unsigned long flags; - - if (!event->hw.idx || !event->hw.sample_period) - return; - - if (event->hw.state & PERF_HES_STOPPED) - return; - - local_irq_save(flags); - perf_pmu_disable(event->pmu); - - power_pmu_read(event); - event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; - write_pmc(event->hw.idx, 0); - - perf_event_update_userpage(event); - perf_pmu_enable(event->pmu); - local_irq_restore(flags); -} - -/* - * Start group events scheduling transaction - * Set the flag to make pmu::enable() not perform the - * schedulability test, it will be performed at commit time - */ -void power_pmu_start_txn(struct pmu *pmu) -{ - struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - - perf_pmu_disable(pmu); - cpuhw->group_flag |= PERF_EVENT_TXN; - cpuhw->n_txn_start = cpuhw->n_events; -} - -/* - * Stop group events scheduling transaction - * Clear the flag and pmu::enable() will perform the - * schedulability test. 
- */ -void power_pmu_cancel_txn(struct pmu *pmu) -{ - struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - - cpuhw->group_flag &= ~PERF_EVENT_TXN; - perf_pmu_enable(pmu); -} - -/* - * Commit group events scheduling transaction - * Perform the group schedulability test as a whole - * Return 0 if success - */ -int power_pmu_commit_txn(struct pmu *pmu) -{ - struct cpu_hw_events *cpuhw; - long i, n; - - if (!ppmu) - return -EAGAIN; - cpuhw = &__get_cpu_var(cpu_hw_events); - n = cpuhw->n_events; - if (check_excludes(cpuhw->event, cpuhw->flags, 0, n)) - return -EAGAIN; - i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n); - if (i < 0) - return -EAGAIN; - - for (i = cpuhw->n_txn_start; i < n; ++i) - cpuhw->event[i]->hw.config = cpuhw->events[i]; - - cpuhw->group_flag &= ~PERF_EVENT_TXN; - perf_pmu_enable(pmu); - return 0; -} - -/* - * Return 1 if we might be able to put event on a limited PMC, - * or 0 if not. - * A event can only go on a limited PMC if it counts something - * that a limited PMC can count, doesn't require interrupts, and - * doesn't exclude any processor mode. - */ -static int can_go_on_limited_pmc(struct perf_event *event, u64 ev, - unsigned int flags) -{ - int n; - u64 alt[MAX_EVENT_ALTERNATIVES]; - - if (event->attr.exclude_user - || event->attr.exclude_kernel - || event->attr.exclude_hv - || event->attr.sample_period) - return 0; - - if (ppmu->limited_pmc_event(ev)) - return 1; - - /* - * The requested event_id isn't on a limited PMC already; - * see if any alternative code goes on a limited PMC. - */ - if (!ppmu->get_alternatives) - return 0; - - flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; - n = ppmu->get_alternatives(ev, flags, alt); - - return n > 0; -} - -/* - * Find an alternative event_id that goes on a normal PMC, if possible, - * and return the event_id code, or 0 if there is no such alternative. - * (Note: event_id code 0 is "don't count" on all machines.) 
- */ -static u64 normal_pmc_alternative(u64 ev, unsigned long flags) -{ - u64 alt[MAX_EVENT_ALTERNATIVES]; - int n; - - flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); - n = ppmu->get_alternatives(ev, flags, alt); - if (!n) - return 0; - return alt[0]; -} - -/* Number of perf_events counting hardware events */ -static atomic_t num_events; -/* Used to avoid races in calling reserve/release_pmc_hardware */ -static DEFINE_MUTEX(pmc_reserve_mutex); - -/* - * Release the PMU if this is the last perf_event. - */ -static void hw_perf_event_destroy(struct perf_event *event) -{ - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_events) == 0) - release_pmc_hardware(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -/* - * Translate a generic cache event_id config to a raw event_id code. - */ -static int hw_perf_cache_event(u64 config, u64 *eventp) -{ - unsigned long type, op, result; - int ev; - - if (!ppmu->cache_events) - return -EINVAL; - - /* unpack config */ - type = config & 0xff; - op = (config >> 8) & 0xff; - result = (config >> 16) & 0xff; - - if (type >= PERF_COUNT_HW_CACHE_MAX || - op >= PERF_COUNT_HW_CACHE_OP_MAX || - result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - ev = (*ppmu->cache_events)[type][op][result]; - if (ev == 0) - return -EOPNOTSUPP; - if (ev == -1) - return -EINVAL; - *eventp = ev; - return 0; -} - -static int power_pmu_event_init(struct perf_event *event) -{ - u64 ev; - unsigned long flags; - struct perf_event *ctrs[MAX_HWEVENTS]; - u64 events[MAX_HWEVENTS]; - unsigned int cflags[MAX_HWEVENTS]; - int n; - int err; - struct cpu_hw_events *cpuhw; - - if (!ppmu) - return -ENOENT; - - switch (event->attr.type) { - case PERF_TYPE_HARDWARE: - ev = event->attr.config; - if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return -EOPNOTSUPP; - ev = ppmu->generic_events[ev]; - break; - case PERF_TYPE_HW_CACHE: - err = hw_perf_cache_event(event->attr.config, &ev); - 
if (err) - return err; - break; - case PERF_TYPE_RAW: - ev = event->attr.config; - break; - default: - return -ENOENT; - } - - event->hw.config_base = ev; - event->hw.idx = 0; - - /* - * If we are not running on a hypervisor, force the - * exclude_hv bit to 0 so that we don't care what - * the user set it to. - */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) - event->attr.exclude_hv = 0; - - /* - * If this is a per-task event, then we can use - * PM_RUN_* events interchangeably with their non RUN_* - * equivalents, e.g. PM_RUN_CYC instead of PM_CYC. - * XXX we should check if the task is an idle task. - */ - flags = 0; - if (event->attach_state & PERF_ATTACH_TASK) - flags |= PPMU_ONLY_COUNT_RUN; - - /* - * If this machine has limited events, check whether this - * event_id could go on a limited event. - */ - if (ppmu->flags & PPMU_LIMITED_PMC5_6) { - if (can_go_on_limited_pmc(event, ev, flags)) { - flags |= PPMU_LIMITED_PMC_OK; - } else if (ppmu->limited_pmc_event(ev)) { - /* - * The requested event_id is on a limited PMC, - * but we can't use a limited PMC; see if any - * alternative goes on a normal PMC. - */ - ev = normal_pmc_alternative(ev, flags); - if (!ev) - return -EINVAL; - } - } - - /* - * If this is in a group, check if it can go on with all the - * other hardware events in the group. We assume the event - * hasn't been linked into its leader's sibling list at this point. 
- */ - n = 0; - if (event->group_leader != event) { - n = collect_events(event->group_leader, ppmu->n_counter - 1, - ctrs, events, cflags); - if (n < 0) - return -EINVAL; - } - events[n] = ev; - ctrs[n] = event; - cflags[n] = flags; - if (check_excludes(ctrs, cflags, n, 1)) - return -EINVAL; - - cpuhw = &get_cpu_var(cpu_hw_events); - err = power_check_constraints(cpuhw, events, cflags, n + 1); - put_cpu_var(cpu_hw_events); - if (err) - return -EINVAL; - - event->hw.config = events[n]; - event->hw.event_base = cflags[n]; - event->hw.last_period = event->hw.sample_period; - local64_set(&event->hw.period_left, event->hw.last_period); - - /* - * See if we need to reserve the PMU. - * If no events are currently in use, then we have to take a - * mutex to ensure that we don't race with another task doing - * reserve_pmc_hardware or release_pmc_hardware. - */ - err = 0; - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && - reserve_pmc_hardware(perf_event_interrupt)) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); - } - event->destroy = hw_perf_event_destroy; - - return err; -} - -struct pmu power_pmu = { - .pmu_enable = power_pmu_enable, - .pmu_disable = power_pmu_disable, - .event_init = power_pmu_event_init, - .add = power_pmu_add, - .del = power_pmu_del, - .start = power_pmu_start, - .stop = power_pmu_stop, - .read = power_pmu_read, - .start_txn = power_pmu_start_txn, - .cancel_txn = power_pmu_cancel_txn, - .commit_txn = power_pmu_commit_txn, -}; - -/* - * A counter has overflowed; update its count and record - * things if requested. Note that interrupts are hard-disabled - * here so there is no possibility of being interrupted. 
- */ -static void record_and_restart(struct perf_event *event, unsigned long val, - struct pt_regs *regs) -{ - u64 period = event->hw.sample_period; - s64 prev, delta, left; - int record = 0; - - if (event->hw.state & PERF_HES_STOPPED) { - write_pmc(event->hw.idx, 0); - return; - } - - /* we don't have to worry about interrupts here */ - prev = local64_read(&event->hw.prev_count); - delta = check_and_compute_delta(prev, val); - local64_add(delta, &event->count); - - /* - * See if the total period for this event has expired, - * and update for the next period. - */ - val = 0; - left = local64_read(&event->hw.period_left) - delta; - if (period) { - if (left <= 0) { - left += period; - if (left <= 0) - left = period; - record = 1; - event->hw.last_period = event->hw.sample_period; - } - if (left < 0x80000000LL) - val = 0x80000000LL - left; - } - - write_pmc(event->hw.idx, val); - local64_set(&event->hw.prev_count, val); - local64_set(&event->hw.period_left, left); - perf_event_update_userpage(event); - - /* - * Finally record data if requested. - */ - if (record) { - struct perf_sample_data data; - - perf_sample_data_init(&data, ~0ULL); - data.period = event->hw.last_period; - - if (event->attr.sample_type & PERF_SAMPLE_ADDR) - perf_get_data_addr(regs, &data.addr); - - if (perf_event_overflow(event, &data, regs)) - power_pmu_stop(event, 0); - } -} - -/* - * Called from generic code to get the misc flags (i.e. processor mode) - * for an event_id. - */ -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - u32 flags = perf_get_misc_flags(regs); - - if (flags) - return flags; - return user_mode(regs) ? PERF_RECORD_MISC_USER : - PERF_RECORD_MISC_KERNEL; -} - -/* - * Called from generic code to get the instruction pointer - * for an event_id. 
- */ -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - unsigned long ip; - - if (TRAP(regs) != 0xf00) - return regs->nip; /* not a PMU interrupt */ - - ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs); - return ip; -} - -static bool pmc_overflow(unsigned long val) -{ - if ((int)val < 0) - return true; - - /* - * Events on POWER7 can roll back if a speculative event doesn't - * eventually complete. Unfortunately in some rare cases they will - * raise a performance monitor exception. We need to catch this to - * ensure we reset the PMC. In all cases the PMC will be 256 or less - * cycles from overflow. - * - * We only do this if the first pass fails to find any overflowing - * PMCs because a user might set a period of less than 256 and we - * don't want to mistakenly reset them. - */ - if (__is_processor(PV_POWER7) && ((0x80000000 - val) <= 256)) - return true; - - return false; -} - -/* - * Performance monitor interrupt stuff - */ -static void perf_event_interrupt(struct pt_regs *regs) -{ - int i; - struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - struct perf_event *event; - unsigned long val; - int found = 0; - int nmi; - - if (cpuhw->n_limited) - freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5), - mfspr(SPRN_PMC6)); - - perf_read_regs(regs); - - nmi = perf_intr_is_nmi(regs); - if (nmi) - nmi_enter(); - else - irq_enter(); - - for (i = 0; i < cpuhw->n_events; ++i) { - event = cpuhw->event[i]; - if (!event->hw.idx || is_limited_pmc(event->hw.idx)) - continue; - val = read_pmc(event->hw.idx); - if ((int)val < 0) { - /* event has overflowed */ - found = 1; - record_and_restart(event, val, regs); - } - } - - /* - * In case we didn't find and reset the event that caused - * the interrupt, scan all events and reset any that are - * negative, to avoid getting continual interrupts. - * Any that we processed in the previous loop will not be negative. 
- */ - if (!found) { - for (i = 0; i < ppmu->n_counter; ++i) { - if (is_limited_pmc(i + 1)) - continue; - val = read_pmc(i + 1); - if (pmc_overflow(val)) - write_pmc(i + 1, 0); - } - } - - /* - * Reset MMCR0 to its normal value. This will set PMXE and - * clear FC (freeze counters) and PMAO (perf mon alert occurred) - * and thus allow interrupts to occur again. - * XXX might want to use MSR.PM to keep the events frozen until - * we get back out of this interrupt. - */ - write_mmcr0(cpuhw, cpuhw->mmcr[0]); - - if (nmi) - nmi_exit(); - else - irq_exit(); -} - -static void power_pmu_setup(int cpu) -{ - struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); - - if (!ppmu) - return; - memset(cpuhw, 0, sizeof(*cpuhw)); - cpuhw->mmcr[0] = MMCR0_FC; -} - -static int __cpuinit -power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - power_pmu_setup(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -int __cpuinit register_power_pmu(struct power_pmu *pmu) -{ - if (ppmu) - return -EBUSY; /* something's already registered */ - - ppmu = pmu; - pr_info("%s performance monitor hardware support registered\n", - pmu->name); - -#ifdef MSR_HV - /* - * Use FCHV to ignore kernel events if MSR.HV is set. - */ - if (mfmsr() & MSR_HV) - freeze_events_kernel = MMCR0_FCHV; -#endif /* CONFIG_PPC64 */ - - perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW); - perf_cpu_notifier(power_pmu_notifier); - - return 0; -} diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c deleted file mode 100644 index 0a6d2a9d569c..000000000000 --- a/arch/powerpc/kernel/perf_event_fsl_emb.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * Performance event support - Freescale Embedded Performance Monitor - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * Copyright 2010 Freescale Semiconductor, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct cpu_hw_events { - int n_events; - int disabled; - u8 pmcs_enabled; - struct perf_event *event[MAX_HWEVENTS]; -}; -static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); - -static struct fsl_emb_pmu *ppmu; - -/* Number of perf_events counting hardware events */ -static atomic_t num_events; -/* Used to avoid races in calling reserve/release_pmc_hardware */ -static DEFINE_MUTEX(pmc_reserve_mutex); - -/* - * If interrupts were soft-disabled when a PMU interrupt occurs, treat - * it as an NMI. - */ -static inline int perf_intr_is_nmi(struct pt_regs *regs) -{ -#ifdef __powerpc64__ - return !regs->softe; -#else - return 0; -#endif -} - -static void perf_event_interrupt(struct pt_regs *regs); - -/* - * Read one performance monitor counter (PMC). - */ -static unsigned long read_pmc(int idx) -{ - unsigned long val; - - switch (idx) { - case 0: - val = mfpmr(PMRN_PMC0); - break; - case 1: - val = mfpmr(PMRN_PMC1); - break; - case 2: - val = mfpmr(PMRN_PMC2); - break; - case 3: - val = mfpmr(PMRN_PMC3); - break; - default: - printk(KERN_ERR "oops trying to read PMC%d\n", idx); - val = 0; - } - return val; -} - -/* - * Write one PMC. 
- */ -static void write_pmc(int idx, unsigned long val) -{ - switch (idx) { - case 0: - mtpmr(PMRN_PMC0, val); - break; - case 1: - mtpmr(PMRN_PMC1, val); - break; - case 2: - mtpmr(PMRN_PMC2, val); - break; - case 3: - mtpmr(PMRN_PMC3, val); - break; - default: - printk(KERN_ERR "oops trying to write PMC%d\n", idx); - } - - isync(); -} - -/* - * Write one local control A register - */ -static void write_pmlca(int idx, unsigned long val) -{ - switch (idx) { - case 0: - mtpmr(PMRN_PMLCA0, val); - break; - case 1: - mtpmr(PMRN_PMLCA1, val); - break; - case 2: - mtpmr(PMRN_PMLCA2, val); - break; - case 3: - mtpmr(PMRN_PMLCA3, val); - break; - default: - printk(KERN_ERR "oops trying to write PMLCA%d\n", idx); - } - - isync(); -} - -/* - * Write one local control B register - */ -static void write_pmlcb(int idx, unsigned long val) -{ - switch (idx) { - case 0: - mtpmr(PMRN_PMLCB0, val); - break; - case 1: - mtpmr(PMRN_PMLCB1, val); - break; - case 2: - mtpmr(PMRN_PMLCB2, val); - break; - case 3: - mtpmr(PMRN_PMLCB3, val); - break; - default: - printk(KERN_ERR "oops trying to write PMLCB%d\n", idx); - } - - isync(); -} - -static void fsl_emb_pmu_read(struct perf_event *event) -{ - s64 val, delta, prev; - - if (event->hw.state & PERF_HES_STOPPED) - return; - - /* - * Performance monitor interrupts come even when interrupts - * are soft-disabled, as long as interrupts are hard-enabled. - * Therefore we treat them like NMIs. - */ - do { - prev = local64_read(&event->hw.prev_count); - barrier(); - val = read_pmc(event->hw.idx); - } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); - - /* The counters are only 32 bits wide */ - delta = (val - prev) & 0xfffffffful; - local64_add(delta, &event->count); - local64_sub(delta, &event->hw.period_left); -} - -/* - * Disable all events to prevent PMU interrupts and to allow - * events to be added or removed. 
- */ -static void fsl_emb_pmu_disable(struct pmu *pmu) -{ - struct cpu_hw_events *cpuhw; - unsigned long flags; - - local_irq_save(flags); - cpuhw = &__get_cpu_var(cpu_hw_events); - - if (!cpuhw->disabled) { - cpuhw->disabled = 1; - - /* - * Check if we ever enabled the PMU on this cpu. - */ - if (!cpuhw->pmcs_enabled) { - ppc_enable_pmcs(); - cpuhw->pmcs_enabled = 1; - } - - if (atomic_read(&num_events)) { - /* - * Set the 'freeze all counters' bit, and disable - * interrupts. The barrier is to make sure the - * mtpmr has been executed and the PMU has frozen - * the events before we return. - */ - - mtpmr(PMRN_PMGC0, PMGC0_FAC); - isync(); - } - } - local_irq_restore(flags); -} - -/* - * Re-enable all events if disable == 0. - * If we were previously disabled and events were added, then - * put the new config on the PMU. - */ -static void fsl_emb_pmu_enable(struct pmu *pmu) -{ - struct cpu_hw_events *cpuhw; - unsigned long flags; - - local_irq_save(flags); - cpuhw = &__get_cpu_var(cpu_hw_events); - if (!cpuhw->disabled) - goto out; - - cpuhw->disabled = 0; - ppc_set_pmu_inuse(cpuhw->n_events != 0); - - if (cpuhw->n_events > 0) { - mtpmr(PMRN_PMGC0, PMGC0_PMIE | PMGC0_FCECE); - isync(); - } - - out: - local_irq_restore(flags); -} - -static int collect_events(struct perf_event *group, int max_count, - struct perf_event *ctrs[]) -{ - int n = 0; - struct perf_event *event; - - if (!is_software_event(group)) { - if (n >= max_count) - return -1; - ctrs[n] = group; - n++; - } - list_for_each_entry(event, &group->sibling_list, group_entry) { - if (!is_software_event(event) && - event->state != PERF_EVENT_STATE_OFF) { - if (n >= max_count) - return -1; - ctrs[n] = event; - n++; - } - } - return n; -} - -/* context locked on entry */ -static int fsl_emb_pmu_add(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuhw; - int ret = -EAGAIN; - int num_counters = ppmu->n_counter; - u64 val; - int i; - - perf_pmu_disable(event->pmu); - cpuhw = 
&get_cpu_var(cpu_hw_events); - - if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) - num_counters = ppmu->n_restricted; - - /* - * Allocate counters from top-down, so that restricted-capable - * counters are kept free as long as possible. - */ - for (i = num_counters - 1; i >= 0; i--) { - if (cpuhw->event[i]) - continue; - - break; - } - - if (i < 0) - goto out; - - event->hw.idx = i; - cpuhw->event[i] = event; - ++cpuhw->n_events; - - val = 0; - if (event->hw.sample_period) { - s64 left = local64_read(&event->hw.period_left); - if (left < 0x80000000L) - val = 0x80000000L - left; - } - local64_set(&event->hw.prev_count, val); - - if (!(flags & PERF_EF_START)) { - event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE; - val = 0; - } - - write_pmc(i, val); - perf_event_update_userpage(event); - - write_pmlcb(i, event->hw.config >> 32); - write_pmlca(i, event->hw.config_base); - - ret = 0; - out: - put_cpu_var(cpu_hw_events); - perf_pmu_enable(event->pmu); - return ret; -} - -/* context locked on entry */ -static void fsl_emb_pmu_del(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuhw; - int i = event->hw.idx; - - perf_pmu_disable(event->pmu); - if (i < 0) - goto out; - - fsl_emb_pmu_read(event); - - cpuhw = &get_cpu_var(cpu_hw_events); - - WARN_ON(event != cpuhw->event[event->hw.idx]); - - write_pmlca(i, 0); - write_pmlcb(i, 0); - write_pmc(i, 0); - - cpuhw->event[i] = NULL; - event->hw.idx = -1; - - /* - * TODO: if at least one restricted event exists, and we - * just freed up a non-restricted-capable counter, and - * there is a restricted-capable counter occupied by - * a non-restricted event, migrate that event to the - * vacated counter. 
- */ - - cpuhw->n_events--; - - out: - perf_pmu_enable(event->pmu); - put_cpu_var(cpu_hw_events); -} - -static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags) -{ - unsigned long flags; - s64 left; - - if (event->hw.idx < 0 || !event->hw.sample_period) - return; - - if (!(event->hw.state & PERF_HES_STOPPED)) - return; - - if (ef_flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - - local_irq_save(flags); - perf_pmu_disable(event->pmu); - - event->hw.state = 0; - left = local64_read(&event->hw.period_left); - write_pmc(event->hw.idx, left); - - perf_event_update_userpage(event); - perf_pmu_enable(event->pmu); - local_irq_restore(flags); -} - -static void fsl_emb_pmu_stop(struct perf_event *event, int ef_flags) -{ - unsigned long flags; - - if (event->hw.idx < 0 || !event->hw.sample_period) - return; - - if (event->hw.state & PERF_HES_STOPPED) - return; - - local_irq_save(flags); - perf_pmu_disable(event->pmu); - - fsl_emb_pmu_read(event); - event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; - write_pmc(event->hw.idx, 0); - - perf_event_update_userpage(event); - perf_pmu_enable(event->pmu); - local_irq_restore(flags); -} - -/* - * Release the PMU if this is the last perf_event. - */ -static void hw_perf_event_destroy(struct perf_event *event) -{ - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_events) == 0) - release_pmc_hardware(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -/* - * Translate a generic cache event_id config to a raw event_id code. 
- */ -static int hw_perf_cache_event(u64 config, u64 *eventp) -{ - unsigned long type, op, result; - int ev; - - if (!ppmu->cache_events) - return -EINVAL; - - /* unpack config */ - type = config & 0xff; - op = (config >> 8) & 0xff; - result = (config >> 16) & 0xff; - - if (type >= PERF_COUNT_HW_CACHE_MAX || - op >= PERF_COUNT_HW_CACHE_OP_MAX || - result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - ev = (*ppmu->cache_events)[type][op][result]; - if (ev == 0) - return -EOPNOTSUPP; - if (ev == -1) - return -EINVAL; - *eventp = ev; - return 0; -} - -static int fsl_emb_pmu_event_init(struct perf_event *event) -{ - u64 ev; - struct perf_event *events[MAX_HWEVENTS]; - int n; - int err; - int num_restricted; - int i; - - switch (event->attr.type) { - case PERF_TYPE_HARDWARE: - ev = event->attr.config; - if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return -EOPNOTSUPP; - ev = ppmu->generic_events[ev]; - break; - - case PERF_TYPE_HW_CACHE: - err = hw_perf_cache_event(event->attr.config, &ev); - if (err) - return err; - break; - - case PERF_TYPE_RAW: - ev = event->attr.config; - break; - - default: - return -ENOENT; - } - - event->hw.config = ppmu->xlate_event(ev); - if (!(event->hw.config & FSL_EMB_EVENT_VALID)) - return -EINVAL; - - /* - * If this is in a group, check if it can go on with all the - * other hardware events in the group. We assume the event - * hasn't been linked into its leader's sibling list at this point. 
- */ - n = 0; - if (event->group_leader != event) { - n = collect_events(event->group_leader, - ppmu->n_counter - 1, events); - if (n < 0) - return -EINVAL; - } - - if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) { - num_restricted = 0; - for (i = 0; i < n; i++) { - if (events[i]->hw.config & FSL_EMB_EVENT_RESTRICTED) - num_restricted++; - } - - if (num_restricted >= ppmu->n_restricted) - return -EINVAL; - } - - event->hw.idx = -1; - - event->hw.config_base = PMLCA_CE | PMLCA_FCM1 | - (u32)((ev << 16) & PMLCA_EVENT_MASK); - - if (event->attr.exclude_user) - event->hw.config_base |= PMLCA_FCU; - if (event->attr.exclude_kernel) - event->hw.config_base |= PMLCA_FCS; - if (event->attr.exclude_idle) - return -ENOTSUPP; - - event->hw.last_period = event->hw.sample_period; - local64_set(&event->hw.period_left, event->hw.last_period); - - /* - * See if we need to reserve the PMU. - * If no events are currently in use, then we have to take a - * mutex to ensure that we don't race with another task doing - * reserve_pmc_hardware or release_pmc_hardware. - */ - err = 0; - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && - reserve_pmc_hardware(perf_event_interrupt)) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); - - mtpmr(PMRN_PMGC0, PMGC0_FAC); - isync(); - } - event->destroy = hw_perf_event_destroy; - - return err; -} - -static struct pmu fsl_emb_pmu = { - .pmu_enable = fsl_emb_pmu_enable, - .pmu_disable = fsl_emb_pmu_disable, - .event_init = fsl_emb_pmu_event_init, - .add = fsl_emb_pmu_add, - .del = fsl_emb_pmu_del, - .start = fsl_emb_pmu_start, - .stop = fsl_emb_pmu_stop, - .read = fsl_emb_pmu_read, -}; - -/* - * A counter has overflowed; update its count and record - * things if requested. Note that interrupts are hard-disabled - * here so there is no possibility of being interrupted. 
- */ -static void record_and_restart(struct perf_event *event, unsigned long val, - struct pt_regs *regs) -{ - u64 period = event->hw.sample_period; - s64 prev, delta, left; - int record = 0; - - if (event->hw.state & PERF_HES_STOPPED) { - write_pmc(event->hw.idx, 0); - return; - } - - /* we don't have to worry about interrupts here */ - prev = local64_read(&event->hw.prev_count); - delta = (val - prev) & 0xfffffffful; - local64_add(delta, &event->count); - - /* - * See if the total period for this event has expired, - * and update for the next period. - */ - val = 0; - left = local64_read(&event->hw.period_left) - delta; - if (period) { - if (left <= 0) { - left += period; - if (left <= 0) - left = period; - record = 1; - event->hw.last_period = event->hw.sample_period; - } - if (left < 0x80000000LL) - val = 0x80000000LL - left; - } - - write_pmc(event->hw.idx, val); - local64_set(&event->hw.prev_count, val); - local64_set(&event->hw.period_left, left); - perf_event_update_userpage(event); - - /* - * Finally record data if requested. - */ - if (record) { - struct perf_sample_data data; - - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; - - if (perf_event_overflow(event, &data, regs)) - fsl_emb_pmu_stop(event, 0); - } -} - -static void perf_event_interrupt(struct pt_regs *regs) -{ - int i; - struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - struct perf_event *event; - unsigned long val; - int found = 0; - int nmi; - - nmi = perf_intr_is_nmi(regs); - if (nmi) - nmi_enter(); - else - irq_enter(); - - for (i = 0; i < ppmu->n_counter; ++i) { - event = cpuhw->event[i]; - - val = read_pmc(i); - if ((int)val < 0) { - if (event) { - /* event has overflowed */ - found = 1; - record_and_restart(event, val, regs); - } else { - /* - * Disabled counter is negative, - * reset it just in case. - */ - write_pmc(i, 0); - } - } - } - - /* PMM will keep counters frozen until we return from the interrupt. 
*/ - mtmsr(mfmsr() | MSR_PMM); - mtpmr(PMRN_PMGC0, PMGC0_PMIE | PMGC0_FCECE); - isync(); - - if (nmi) - nmi_exit(); - else - irq_exit(); -} - -void hw_perf_event_setup(int cpu) -{ - struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); - - memset(cpuhw, 0, sizeof(*cpuhw)); -} - -int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu) -{ - if (ppmu) - return -EBUSY; /* something's already registered */ - - ppmu = pmu; - pr_info("%s performance monitor hardware support registered\n", - pmu->name); - - perf_pmu_register(&fsl_emb_pmu, "cpu", PERF_TYPE_RAW); - - return 0; -} diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c deleted file mode 100644 index b4f1dda4d089..000000000000 --- a/arch/powerpc/kernel/power4-pmu.c +++ /dev/null @@ -1,621 +0,0 @@ -/* - * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors. - * - * Copyright 2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ -#include -#include -#include -#include -#include - -/* - * Bits in event code for POWER4 - */ -#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */ -#define PM_PMC_MSK 0xf -#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ -#define PM_UNIT_MSK 0xf -#define PM_LOWER_SH 6 -#define PM_LOWER_MSK 1 -#define PM_LOWER_MSKS 0x40 -#define PM_BYTE_SH 4 /* Byte number of event bus to use */ -#define PM_BYTE_MSK 3 -#define PM_PMCSEL_MSK 7 - -/* - * Unit code values - */ -#define PM_FPU 1 -#define PM_ISU1 2 -#define PM_IFU 3 -#define PM_IDU0 4 -#define PM_ISU1_ALT 6 -#define PM_ISU2 7 -#define PM_IFU_ALT 8 -#define PM_LSU0 9 -#define PM_LSU1 0xc -#define PM_GPS 0xf - -/* - * Bits in MMCR0 for POWER4 - */ -#define MMCR0_PMC1SEL_SH 8 -#define MMCR0_PMC2SEL_SH 1 -#define MMCR_PMCSEL_MSK 0x1f - -/* - * Bits in MMCR1 for POWER4 - */ -#define MMCR1_TTM0SEL_SH 62 -#define MMCR1_TTC0SEL_SH 61 -#define MMCR1_TTM1SEL_SH 59 -#define MMCR1_TTC1SEL_SH 58 -#define MMCR1_TTM2SEL_SH 56 -#define MMCR1_TTC2SEL_SH 55 -#define MMCR1_TTM3SEL_SH 53 -#define MMCR1_TTC3SEL_SH 52 -#define MMCR1_TTMSEL_MSK 3 -#define MMCR1_TD_CP_DBG0SEL_SH 50 -#define MMCR1_TD_CP_DBG1SEL_SH 48 -#define MMCR1_TD_CP_DBG2SEL_SH 46 -#define MMCR1_TD_CP_DBG3SEL_SH 44 -#define MMCR1_DEBUG0SEL_SH 43 -#define MMCR1_DEBUG1SEL_SH 42 -#define MMCR1_DEBUG2SEL_SH 41 -#define MMCR1_DEBUG3SEL_SH 40 -#define MMCR1_PMC1_ADDER_SEL_SH 39 -#define MMCR1_PMC2_ADDER_SEL_SH 38 -#define MMCR1_PMC6_ADDER_SEL_SH 37 -#define MMCR1_PMC5_ADDER_SEL_SH 36 -#define MMCR1_PMC8_ADDER_SEL_SH 35 -#define MMCR1_PMC7_ADDER_SEL_SH 34 -#define MMCR1_PMC3_ADDER_SEL_SH 33 -#define MMCR1_PMC4_ADDER_SEL_SH 32 -#define MMCR1_PMC3SEL_SH 27 -#define MMCR1_PMC4SEL_SH 22 -#define MMCR1_PMC5SEL_SH 17 -#define MMCR1_PMC6SEL_SH 12 -#define MMCR1_PMC7SEL_SH 7 -#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */ - -static short mmcr1_adder_bits[8] = { - MMCR1_PMC1_ADDER_SEL_SH, - MMCR1_PMC2_ADDER_SEL_SH, - 
MMCR1_PMC3_ADDER_SEL_SH, - MMCR1_PMC4_ADDER_SEL_SH, - MMCR1_PMC5_ADDER_SEL_SH, - MMCR1_PMC6_ADDER_SEL_SH, - MMCR1_PMC7_ADDER_SEL_SH, - MMCR1_PMC8_ADDER_SEL_SH -}; - -/* - * Bits in MMCRA - */ -#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */ - -/* - * Layout of constraint bits: - * 6666555555555544444444443333333333222222222211111111110000000000 - * 3210987654321098765432109876543210987654321098765432109876543210 - * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><> - * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 - * \SMPL ||\TTC3SEL - * |\TTC_IFU_SEL - * \TTM2SEL0 - * - * SMPL - SAMPLE_ENABLE constraint - * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000 - * - * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2 - * 55: UC1 error 0x0080_0000_0000_0000 - * 54: FPU events needed 0x0040_0000_0000_0000 - * 53: ISU1 events needed 0x0020_0000_0000_0000 - * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000 - * - * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0 - * 51: UC2 error 0x0008_0000_0000_0000 - * 50: FPU events needed 0x0004_0000_0000_0000 - * 49: IFU events needed 0x0002_0000_0000_0000 - * 48: LSU0 events needed 0x0001_0000_0000_0000 - * - * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1 - * 47: UC3 error 0x8000_0000_0000 - * 46: LSU0 events needed 0x4000_0000_0000 - * 45: IFU events needed 0x2000_0000_0000 - * 44: IDU0|ISU2 events needed 0x1000_0000_0000 - * 43: ISU1 events needed 0x0800_0000_0000 - * - * TTM2SEL0 - * 42: 0 = IDU0 events needed - * 1 = ISU2 events needed 0x0400_0000_0000 - * - * TTC_IFU_SEL - * 41: 0 = IFU.U events needed - * 1 = IFU.L events needed 0x0200_0000_0000 - * - * TTC3SEL - * 40: 0 = LSU1.U events needed - * 1 = LSU1.L events needed 0x0100_0000_0000 - * - * PS1 - * 39: PS1 error 0x0080_0000_0000 - * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 - * - * PS2 - * 35: PS2 error 0x0008_0000_0000 - * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 
- * - * B0 - * 28-31: Byte 0 event source 0xf000_0000 - * 1 = FPU - * 2 = ISU1 - * 3 = IFU - * 4 = IDU0 - * 7 = ISU2 - * 9 = LSU0 - * c = LSU1 - * f = GPS - * - * B1, B2, B3 - * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources - * - * P8 - * 15: P8 error 0x8000 - * 14-15: Count of events needing PMC8 - * - * P1..P7 - * 0-13: Count of events needing PMC1..PMC7 - * - * Note: this doesn't allow events using IFU.U to be combined with events - * using IFU.L, though that is feasible (using TTM0 and TTM2). However - * there are no listed events for IFU.L (they are debug events not - * verified for performance monitoring) so this shouldn't cause a - * problem. - */ - -static struct unitinfo { - unsigned long value, mask; - int unit; - int lowerbit; -} p4_unitinfo[16] = { - [PM_FPU] = { 0x44000000000000ul, 0x88000000000000ul, PM_FPU, 0 }, - [PM_ISU1] = { 0x20080000000000ul, 0x88000000000000ul, PM_ISU1, 0 }, - [PM_ISU1_ALT] = - { 0x20080000000000ul, 0x88000000000000ul, PM_ISU1, 0 }, - [PM_IFU] = { 0x02200000000000ul, 0x08820000000000ul, PM_IFU, 41 }, - [PM_IFU_ALT] = - { 0x02200000000000ul, 0x08820000000000ul, PM_IFU, 41 }, - [PM_IDU0] = { 0x10100000000000ul, 0x80840000000000ul, PM_IDU0, 1 }, - [PM_ISU2] = { 0x10140000000000ul, 0x80840000000000ul, PM_ISU2, 0 }, - [PM_LSU0] = { 0x01400000000000ul, 0x08800000000000ul, PM_LSU0, 0 }, - [PM_LSU1] = { 0x00000000000000ul, 0x00010000000000ul, PM_LSU1, 40 }, - [PM_GPS] = { 0x00000000000000ul, 0x00000000000000ul, PM_GPS, 0 } -}; - -static unsigned char direct_marked_event[8] = { - (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */ - (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */ - (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */ - (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */ - (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */ - (1<<3) | (1<<4) | (1<<5), - /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */ - (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */ - (1<<4), /* PMC8: PM_MRK_LSU_FIN 
*/ -}; - -/* - * Returns 1 if event counts things relating to marked instructions - * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. - */ -static int p4_marked_instr_event(u64 event) -{ - int pmc, psel, unit, byte, bit; - unsigned int mask; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - psel = event & PM_PMCSEL_MSK; - if (pmc) { - if (direct_marked_event[pmc - 1] & (1 << psel)) - return 1; - if (psel == 0) /* add events */ - bit = (pmc <= 4)? pmc - 1: 8 - pmc; - else if (psel == 6) /* decode events */ - bit = 4; - else - return 0; - } else - bit = psel; - - byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; - mask = 0; - switch (unit) { - case PM_LSU1: - if (event & PM_LOWER_MSKS) - mask = 1 << 28; /* byte 7 bit 4 */ - else - mask = 6 << 24; /* byte 3 bits 1 and 2 */ - break; - case PM_LSU0: - /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */ - mask = 0x083dff00; - } - return (mask >> (byte * 8 + bit)) & 1; -} - -static int p4_get_constraint(u64 event, unsigned long *maskp, - unsigned long *valp) -{ - int pmc, byte, unit, lower, sh; - unsigned long mask = 0, value = 0; - int grp = -1; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc > 8) - return -1; - sh = (pmc - 1) * 2; - mask |= 2 << sh; - value |= 1 << sh; - grp = ((pmc - 1) >> 1) & 1; - } - unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; - byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - if (unit) { - lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK; - - /* - * Bus events on bytes 0 and 2 can be counted - * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. 
- */ - if (!pmc) - grp = byte & 1; - - if (!p4_unitinfo[unit].unit) - return -1; - mask |= p4_unitinfo[unit].mask; - value |= p4_unitinfo[unit].value; - sh = p4_unitinfo[unit].lowerbit; - if (sh > 1) - value |= (unsigned long)lower << sh; - else if (lower != sh) - return -1; - unit = p4_unitinfo[unit].unit; - - /* Set byte lane select field */ - mask |= 0xfULL << (28 - 4 * byte); - value |= (unsigned long)unit << (28 - 4 * byte); - } - if (grp == 0) { - /* increment PMC1/2/5/6 field */ - mask |= 0x8000000000ull; - value |= 0x1000000000ull; - } else { - /* increment PMC3/4/7/8 field */ - mask |= 0x800000000ull; - value |= 0x100000000ull; - } - - /* Marked instruction events need sample_enable set */ - if (p4_marked_instr_event(event)) { - mask |= 1ull << 56; - value |= 1ull << 56; - } - - /* PMCSEL=6 decode events on byte 2 need sample_enable clear */ - if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2) - mask |= 1ull << 56; - - *maskp = mask; - *valp = value; - return 0; -} - -static unsigned int ppc_inst_cmpl[] = { - 0x1001, 0x4001, 0x6001, 0x7001, 0x8001 -}; - -static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[]) -{ - int i, j, na; - - alt[0] = event; - na = 1; - - /* 2 possibilities for PM_GRP_DISP_REJECT */ - if (event == 0x8003 || event == 0x0224) { - alt[1] = event ^ (0x8003 ^ 0x0224); - return 2; - } - - /* 2 possibilities for PM_ST_MISS_L1 */ - if (event == 0x0c13 || event == 0x0c23) { - alt[1] = event ^ (0x0c13 ^ 0x0c23); - return 2; - } - - /* several possibilities for PM_INST_CMPL */ - for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) { - if (event == ppc_inst_cmpl[i]) { - for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j) - if (j != i) - alt[na++] = ppc_inst_cmpl[j]; - break; - } - } - - return na; -} - -static int p4_compute_mmcr(u64 event[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]) -{ - unsigned long mmcr0 = 0, mmcr1 = 0, mmcra = 0; - unsigned int pmc, unit, byte, psel, lower; - unsigned int ttm, grp; - unsigned int 
pmc_inuse = 0; - unsigned int pmc_grp_use[2]; - unsigned char busbyte[4]; - unsigned char unituse[16]; - unsigned int unitlower = 0; - int i; - - if (n_ev > 8) - return -1; - - /* First pass to count resource use */ - pmc_grp_use[0] = pmc_grp_use[1] = 0; - memset(busbyte, 0, sizeof(busbyte)); - memset(unituse, 0, sizeof(unituse)); - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc_inuse & (1 << (pmc - 1))) - return -1; - pmc_inuse |= 1 << (pmc - 1); - /* count 1/2/5/6 vs 3/4/7/8 use */ - ++pmc_grp_use[((pmc - 1) >> 1) & 1]; - } - unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; - byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; - lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK; - if (unit) { - if (!pmc) - ++pmc_grp_use[byte & 1]; - if (unit == 6 || unit == 8) - /* map alt ISU1/IFU codes: 6->2, 8->3 */ - unit = (unit >> 1) - 1; - if (busbyte[byte] && busbyte[byte] != unit) - return -1; - busbyte[byte] = unit; - lower <<= unit; - if (unituse[unit] && lower != (unitlower & lower)) - return -1; - unituse[unit] = 1; - unitlower |= lower; - } - } - if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) - return -1; - - /* - * Assign resources and set multiplexer selects. - * - * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2. - * Each TTMx can only select one unit, but since - * units 2 and 6 are both ISU1, and 3 and 8 are both IFU, - * we have some choices. - */ - if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) { - unituse[6] = 1; /* Move 2 to 6 */ - unituse[2] = 0; - } - if (unituse[3] & (unituse[1] | unituse[2])) { - unituse[8] = 1; /* Move 3 to 8 */ - unituse[3] = 0; - unitlower = (unitlower & ~8) | ((unitlower & 8) << 5); - } - /* Check only one unit per TTMx */ - if (unituse[1] + unituse[2] + unituse[3] > 1 || - unituse[4] + unituse[6] + unituse[7] > 1 || - unituse[8] + unituse[9] > 1 || - (unituse[5] | unituse[10] | unituse[11] | - unituse[13] | unituse[14])) - return -1; - - /* Set TTMxSEL fields. 
Note, units 1-3 => TTM0SEL codes 0-2 */ - mmcr1 |= (unsigned long)(unituse[3] * 2 + unituse[2]) - << MMCR1_TTM0SEL_SH; - mmcr1 |= (unsigned long)(unituse[7] * 3 + unituse[6] * 2) - << MMCR1_TTM1SEL_SH; - mmcr1 |= (unsigned long)unituse[9] << MMCR1_TTM2SEL_SH; - - /* Set TTCxSEL fields. */ - if (unitlower & 0xe) - mmcr1 |= 1ull << MMCR1_TTC0SEL_SH; - if (unitlower & 0xf0) - mmcr1 |= 1ull << MMCR1_TTC1SEL_SH; - if (unitlower & 0xf00) - mmcr1 |= 1ull << MMCR1_TTC2SEL_SH; - if (unitlower & 0x7000) - mmcr1 |= 1ull << MMCR1_TTC3SEL_SH; - - /* Set byte lane select fields. */ - for (byte = 0; byte < 4; ++byte) { - unit = busbyte[byte]; - if (!unit) - continue; - if (unit == 0xf) { - /* special case for GPS */ - mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte); - } else { - if (!unituse[unit]) - ttm = unit - 1; /* 2->1, 3->2 */ - else - ttm = unit >> 2; - mmcr1 |= (unsigned long)ttm - << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); - } - } - - /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; - byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; - psel = event[i] & PM_PMCSEL_MSK; - if (!pmc) { - /* Bus event or 00xxx direct event (off or cycles) */ - if (unit) - psel |= 0x10 | ((byte & 2) << 2); - for (pmc = 0; pmc < 8; ++pmc) { - if (pmc_inuse & (1 << pmc)) - continue; - grp = (pmc >> 1) & 1; - if (unit) { - if (grp == (byte & 1)) - break; - } else if (pmc_grp_use[grp] < 4) { - ++pmc_grp_use[grp]; - break; - } - } - pmc_inuse |= 1 << pmc; - } else { - /* Direct event */ - --pmc; - if (psel == 0 && (byte & 2)) - /* add events on higher-numbered bus */ - mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; - else if (psel == 6 && byte == 3) - /* seem to need to set sample_enable here */ - mmcra |= MMCRA_SAMPLE_ENABLE; - psel |= 8; - } - if (pmc <= 1) - mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc); - else - mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); - 
if (pmc == 7) /* PMC8 */ - mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH; - hwc[i] = pmc; - if (p4_marked_instr_event(event[i])) - mmcra |= MMCRA_SAMPLE_ENABLE; - } - - if (pmc_inuse & 1) - mmcr0 |= MMCR0_PMC1CE; - if (pmc_inuse & 0xfe) - mmcr0 |= MMCR0_PMCjCE; - - mmcra |= 0x2000; /* mark only one IOP per PPC instruction */ - - /* Return MMCRx values */ - mmcr[0] = mmcr0; - mmcr[1] = mmcr1; - mmcr[2] = mmcra; - return 0; -} - -static void p4_disable_pmc(unsigned int pmc, unsigned long mmcr[]) -{ - /* - * Setting the PMCxSEL field to 0 disables PMC x. - * (Note that pmc is 0-based here, not 1-based.) - */ - if (pmc <= 1) { - mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc)); - } else { - mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2))); - if (pmc == 7) - mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH); - } -} - -static int p4_generic_events[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 7, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x1001, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ - [PERF_COUNT_HW_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ - [PERF_COUNT_HW_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ -}; - -#define C(x) PERF_COUNT_HW_CACHE_##x - -/* - * Table of generalized cache-related events. - * 0 means not supported, -1 means nonsensical, other values - * are event codes. 
- */ -static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { - [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x8c10, 0x3c10 }, - [C(OP_WRITE)] = { 0x7c10, 0xc13 }, - [C(OP_PREFETCH)] = { 0xc35, 0 }, - }, - [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { 0, 0 }, - }, - [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0 }, - [C(OP_WRITE)] = { 0, 0 }, - [C(OP_PREFETCH)] = { 0xc34, 0 }, - }, - [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x904 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x900 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x330, 0x331 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { -1, -1 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, -}; - -static struct power_pmu power4_pmu = { - .name = "POWER4/4+", - .n_counter = 8, - .max_alternatives = 5, - .add_fields = 0x0000001100005555ul, - .test_adder = 0x0011083300000000ul, - .compute_mmcr = p4_compute_mmcr, - .get_constraint = p4_get_constraint, - .get_alternatives = p4_get_alternatives, - .disable_pmc = p4_disable_pmc, - .n_generic = ARRAY_SIZE(p4_generic_events), - .generic_events = p4_generic_events, - .cache_events = &power4_cache_events, -}; - -static int __init init_power4_pmu(void) -{ - if (!cur_cpu_spec->oprofile_cpu_type || - strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power4")) - return -ENODEV; - - return register_power_pmu(&power4_pmu); -} - -early_initcall(init_power4_pmu); diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c deleted file mode 100644 index a8757baa28f3..000000000000 --- 
a/arch/powerpc/kernel/power5+-pmu.c +++ /dev/null @@ -1,690 +0,0 @@ -/* - * Performance counter support for POWER5+/++ (not POWER5) processors. - * - * Copyright 2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include - -/* - * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3) - */ -#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ -#define PM_PMC_MSK 0xf -#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) -#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */ -#define PM_UNIT_MSK 0xf -#define PM_BYTE_SH 12 /* Byte number of event bus to use */ -#define PM_BYTE_MSK 7 -#define PM_GRS_SH 8 /* Storage subsystem mux select */ -#define PM_GRS_MSK 7 -#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */ -#define PM_PMCSEL_MSK 0x7f - -/* Values in PM_UNIT field */ -#define PM_FPU 0 -#define PM_ISU0 1 -#define PM_IFU 2 -#define PM_ISU1 3 -#define PM_IDU 4 -#define PM_ISU0_ALT 6 -#define PM_GRS 7 -#define PM_LSU0 8 -#define PM_LSU1 0xc -#define PM_LASTUNIT 0xc - -/* - * Bits in MMCR1 for POWER5+ - */ -#define MMCR1_TTM0SEL_SH 62 -#define MMCR1_TTM1SEL_SH 60 -#define MMCR1_TTM2SEL_SH 58 -#define MMCR1_TTM3SEL_SH 56 -#define MMCR1_TTMSEL_MSK 3 -#define MMCR1_TD_CP_DBG0SEL_SH 54 -#define MMCR1_TD_CP_DBG1SEL_SH 52 -#define MMCR1_TD_CP_DBG2SEL_SH 50 -#define MMCR1_TD_CP_DBG3SEL_SH 48 -#define MMCR1_GRS_L2SEL_SH 46 -#define MMCR1_GRS_L2SEL_MSK 3 -#define MMCR1_GRS_L3SEL_SH 44 -#define MMCR1_GRS_L3SEL_MSK 3 -#define MMCR1_GRS_MCSEL_SH 41 -#define MMCR1_GRS_MCSEL_MSK 7 -#define MMCR1_GRS_FABSEL_SH 39 -#define MMCR1_GRS_FABSEL_MSK 3 -#define MMCR1_PMC1_ADDER_SEL_SH 35 -#define MMCR1_PMC2_ADDER_SEL_SH 34 -#define MMCR1_PMC3_ADDER_SEL_SH 
33 -#define MMCR1_PMC4_ADDER_SEL_SH 32 -#define MMCR1_PMC1SEL_SH 25 -#define MMCR1_PMC2SEL_SH 17 -#define MMCR1_PMC3SEL_SH 9 -#define MMCR1_PMC4SEL_SH 1 -#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) -#define MMCR1_PMCSEL_MSK 0x7f - -/* - * Layout of constraint bits: - * 6666555555555544444444443333333333222222222211111111110000000000 - * 3210987654321098765432109876543210987654321098765432109876543210 - * [ ><><>< ><> <><>[ > < >< >< >< ><><><><><><> - * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P6P5P4P3P2P1 - * - * NC - number of counters - * 51: NC error 0x0008_0000_0000_0000 - * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 - * - * G0..G3 - GRS mux constraints - * 46-47: GRS_L2SEL value - * 44-45: GRS_L3SEL value - * 41-44: GRS_MCSEL value - * 39-40: GRS_FABSEL value - * Note that these match up with their bit positions in MMCR1 - * - * T0 - TTM0 constraint - * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000 - * - * T1 - TTM1 constraint - * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000 - * - * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS - * 33: UC3 error 0x02_0000_0000 - * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000 - * 31: ISU0 events needed 0x01_8000_0000 - * 30: IDU|GRS events needed 0x00_4000_0000 - * - * B0 - * 24-27: Byte 0 event source 0x0f00_0000 - * Encoding as for the event code - * - * B1, B2, B3 - * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources - * - * P6 - * 11: P6 error 0x800 - * 10-11: Count of events needing PMC6 - * - * P1..P5 - * 0-9: Count of events needing PMC1..PMC5 - */ - -static const int grsel_shift[8] = { - MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, - MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, - MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH -}; - -/* Masks and values for using events from the various units */ -static unsigned long unit_cons[PM_LASTUNIT+1][2] = { - [PM_FPU] = { 0x3200000000ul, 0x0100000000ul }, - [PM_ISU0] = { 0x0200000000ul, 
0x0080000000ul }, - [PM_ISU1] = { 0x3200000000ul, 0x3100000000ul }, - [PM_IFU] = { 0x3200000000ul, 0x2100000000ul }, - [PM_IDU] = { 0x0e00000000ul, 0x0040000000ul }, - [PM_GRS] = { 0x0e00000000ul, 0x0c40000000ul }, -}; - -static int power5p_get_constraint(u64 event, unsigned long *maskp, - unsigned long *valp) -{ - int pmc, byte, unit, sh; - int bit, fmask; - unsigned long mask = 0, value = 0; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc > 6) - return -1; - sh = (pmc - 1) * 2; - mask |= 2 << sh; - value |= 1 << sh; - if (pmc >= 5 && !(event == 0x500009 || event == 0x600005)) - return -1; - } - if (event & PM_BUSEVENT_MSK) { - unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; - if (unit > PM_LASTUNIT) - return -1; - if (unit == PM_ISU0_ALT) - unit = PM_ISU0; - mask |= unit_cons[unit][0]; - value |= unit_cons[unit][1]; - byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - if (byte >= 4) { - if (unit != PM_LSU1) - return -1; - /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ - ++unit; - byte &= 3; - } - if (unit == PM_GRS) { - bit = event & 7; - fmask = (bit == 6)? 
7: 3; - sh = grsel_shift[bit]; - mask |= (unsigned long)fmask << sh; - value |= (unsigned long)((event >> PM_GRS_SH) & fmask) - << sh; - } - /* Set byte lane select field */ - mask |= 0xfUL << (24 - 4 * byte); - value |= (unsigned long)unit << (24 - 4 * byte); - } - if (pmc < 5) { - /* need a counter from PMC1-4 set */ - mask |= 0x8000000000000ul; - value |= 0x1000000000000ul; - } - *maskp = mask; - *valp = value; - return 0; -} - -static int power5p_limited_pmc_event(u64 event) -{ - int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - - return pmc == 5 || pmc == 6; -} - -#define MAX_ALT 3 /* at most 3 alternatives for any event */ - -static const unsigned int event_alternatives[][MAX_ALT] = { - { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */ - { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */ - { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */ - { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */ - { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ - { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */ - { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */ - { 0x100005, 0x600005 }, /* PM_RUN_CYC */ - { 0x100009, 0x200009 }, /* PM_INST_CMPL */ - { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */ - { 0x300009, 0x400009 }, /* PM_INST_DISP */ -}; - -/* - * Scan the alternatives table for a match and return the - * index into the alternatives table if found, else -1. 
- */ -static int find_alternative(unsigned int event) -{ - int i, j; - - for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { - if (event < event_alternatives[i][0]) - break; - for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) - if (event == event_alternatives[i][j]) - return i; - } - return -1; -} - -static const unsigned char bytedecode_alternatives[4][4] = { - /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 }, - /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e }, - /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 }, - /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e } -}; - -/* - * Some direct events for decodes of event bus byte 3 have alternative - * PMCSEL values on other counters. This returns the alternative - * event code for those that do, or -1 otherwise. This also handles - * alternative PCMSEL values for add events. - */ -static s64 find_alternative_bdecode(u64 event) -{ - int pmc, altpmc, pp, j; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc == 0 || pmc > 4) - return -1; - altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */ - pp = event & PM_PMCSEL_MSK; - for (j = 0; j < 4; ++j) { - if (bytedecode_alternatives[pmc - 1][j] == pp) { - return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | - (altpmc << PM_PMC_SH) | - bytedecode_alternatives[altpmc - 1][j]; - } - } - - /* new decode alternatives for power5+ */ - if (pmc == 1 && (pp == 0x0d || pp == 0x0e)) - return event + (2 << PM_PMC_SH) + (0x2e - 0x0d); - if (pmc == 3 && (pp == 0x2e || pp == 0x2f)) - return event - (2 << PM_PMC_SH) - (0x2e - 0x0d); - - /* alternative add event encodings */ - if (pp == 0x10 || pp == 0x28) - return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) | - (altpmc << PM_PMC_SH); - - return -1; -} - -static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[]) -{ - int i, j, nalt = 1; - int nlim; - s64 ae; - - alt[0] = event; - nalt = 1; - nlim = power5p_limited_pmc_event(event); - i = find_alternative(event); - if (i >= 0) { - for (j = 0; j < MAX_ALT; ++j) { - ae = event_alternatives[i][j]; - if (ae && ae != 
event) - alt[nalt++] = ae; - nlim += power5p_limited_pmc_event(ae); - } - } else { - ae = find_alternative_bdecode(event); - if (ae > 0) - alt[nalt++] = ae; - } - - if (flags & PPMU_ONLY_COUNT_RUN) { - /* - * We're only counting in RUN state, - * so PM_CYC is equivalent to PM_RUN_CYC - * and PM_INST_CMPL === PM_RUN_INST_CMPL. - * This doesn't include alternatives that don't provide - * any extra flexibility in assigning PMCs (e.g. - * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC). - * Note that even with these additional alternatives - * we never end up with more than 3 alternatives for any event. - */ - j = nalt; - for (i = 0; i < nalt; ++i) { - switch (alt[i]) { - case 0xf: /* PM_CYC */ - alt[j++] = 0x600005; /* PM_RUN_CYC */ - ++nlim; - break; - case 0x600005: /* PM_RUN_CYC */ - alt[j++] = 0xf; - break; - case 0x100009: /* PM_INST_CMPL */ - alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */ - ++nlim; - break; - case 0x500009: /* PM_RUN_INST_CMPL */ - alt[j++] = 0x100009; /* PM_INST_CMPL */ - alt[j++] = 0x200009; - break; - } - } - nalt = j; - } - - if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) { - /* remove the limited PMC events */ - j = 0; - for (i = 0; i < nalt; ++i) { - if (!power5p_limited_pmc_event(alt[i])) { - alt[j] = alt[i]; - ++j; - } - } - nalt = j; - } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) { - /* remove all but the limited PMC events */ - j = 0; - for (i = 0; i < nalt; ++i) { - if (power5p_limited_pmc_event(alt[i])) { - alt[j] = alt[i]; - ++j; - } - } - nalt = j; - } - - return nalt; -} - -/* - * Map of which direct events on which PMCs are marked instruction events. - * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event. - * Bit 0 is set if it is marked for all PMCs. - * The 0x80 bit indicates a byte decode PMCSEL value. 
- */ -static unsigned char direct_event_is_marked[0x28] = { - 0, /* 00 */ - 0x1f, /* 01 PM_IOPS_CMPL */ - 0x2, /* 02 PM_MRK_GRP_DISP */ - 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ - 0, /* 04 */ - 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */ - 0x80, /* 06 */ - 0x80, /* 07 */ - 0, 0, 0,/* 08 - 0a */ - 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */ - 0, /* 0c */ - 0x80, /* 0d */ - 0x80, /* 0e */ - 0, /* 0f */ - 0, /* 10 */ - 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */ - 0, /* 12 */ - 0x10, /* 13 PM_MRK_GRP_CMPL */ - 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */ - 0x2, /* 15 PM_MRK_GRP_ISSUED */ - 0x80, /* 16 */ - 0x80, /* 17 */ - 0, 0, 0, 0, 0, - 0x80, /* 1d */ - 0x80, /* 1e */ - 0, /* 1f */ - 0x80, /* 20 */ - 0x80, /* 21 */ - 0x80, /* 22 */ - 0x80, /* 23 */ - 0x80, /* 24 */ - 0x80, /* 25 */ - 0x80, /* 26 */ - 0x80, /* 27 */ -}; - -/* - * Returns 1 if event counts things relating to marked instructions - * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
- */ -static int power5p_marked_instr_event(u64 event) -{ - int pmc, psel; - int bit, byte, unit; - u32 mask; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - psel = event & PM_PMCSEL_MSK; - if (pmc >= 5) - return 0; - - bit = -1; - if (psel < sizeof(direct_event_is_marked)) { - if (direct_event_is_marked[psel] & (1 << pmc)) - return 1; - if (direct_event_is_marked[psel] & 0x80) - bit = 4; - else if (psel == 0x08) - bit = pmc - 1; - else if (psel == 0x10) - bit = 4 - pmc; - else if (psel == 0x1b && (pmc == 1 || pmc == 3)) - bit = 4; - } else if ((psel & 0x48) == 0x40) { - bit = psel & 7; - } else if (psel == 0x28) { - bit = pmc - 1; - } else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) { - bit = 4; - } - - if (!(event & PM_BUSEVENT_MSK) || bit == -1) - return 0; - - byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; - if (unit == PM_LSU0) { - /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */ - mask = 0x5dff00; - } else if (unit == PM_LSU1 && byte >= 4) { - byte -= 4; - /* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */ - mask = 0x5f11c000; - } else - return 0; - - return (mask >> (byte * 8 + bit)) & 1; -} - -static int power5p_compute_mmcr(u64 event[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]) -{ - unsigned long mmcr1 = 0; - unsigned long mmcra = 0; - unsigned int pmc, unit, byte, psel; - unsigned int ttm; - int i, isbus, bit, grsel; - unsigned int pmc_inuse = 0; - unsigned char busbyte[4]; - unsigned char unituse[16]; - int ttmuse; - - if (n_ev > 6) - return -1; - - /* First pass to count resource use */ - memset(busbyte, 0, sizeof(busbyte)); - memset(unituse, 0, sizeof(unituse)); - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc > 6) - return -1; - if (pmc_inuse & (1 << (pmc - 1))) - return -1; - pmc_inuse |= 1 << (pmc - 1); - } - if (event[i] & PM_BUSEVENT_MSK) { - unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; - byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; - 
if (unit > PM_LASTUNIT) - return -1; - if (unit == PM_ISU0_ALT) - unit = PM_ISU0; - if (byte >= 4) { - if (unit != PM_LSU1) - return -1; - ++unit; - byte &= 3; - } - if (busbyte[byte] && busbyte[byte] != unit) - return -1; - busbyte[byte] = unit; - unituse[unit] = 1; - } - } - - /* - * Assign resources and set multiplexer selects. - * - * PM_ISU0 can go either on TTM0 or TTM1, but that's the only - * choice we have to deal with. - */ - if (unituse[PM_ISU0] & - (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { - unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */ - unituse[PM_ISU0] = 0; - } - /* Set TTM[01]SEL fields. */ - ttmuse = 0; - for (i = PM_FPU; i <= PM_ISU1; ++i) { - if (!unituse[i]) - continue; - if (ttmuse++) - return -1; - mmcr1 |= (unsigned long)i << MMCR1_TTM0SEL_SH; - } - ttmuse = 0; - for (; i <= PM_GRS; ++i) { - if (!unituse[i]) - continue; - if (ttmuse++) - return -1; - mmcr1 |= (unsigned long)(i & 3) << MMCR1_TTM1SEL_SH; - } - if (ttmuse > 1) - return -1; - - /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. 
*/ - for (byte = 0; byte < 4; ++byte) { - unit = busbyte[byte]; - if (!unit) - continue; - if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { - /* get ISU0 through TTM1 rather than TTM0 */ - unit = PM_ISU0_ALT; - } else if (unit == PM_LSU1 + 1) { - /* select lower word of LSU1 for this byte */ - mmcr1 |= 1ul << (MMCR1_TTM3SEL_SH + 3 - byte); - } - ttm = unit >> 2; - mmcr1 |= (unsigned long)ttm - << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); - } - - /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; - byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; - psel = event[i] & PM_PMCSEL_MSK; - isbus = event[i] & PM_BUSEVENT_MSK; - if (!pmc) { - /* Bus event or any-PMC direct event */ - for (pmc = 0; pmc < 4; ++pmc) { - if (!(pmc_inuse & (1 << pmc))) - break; - } - if (pmc >= 4) - return -1; - pmc_inuse |= 1 << pmc; - } else if (pmc <= 4) { - /* Direct event */ - --pmc; - if (isbus && (byte & 2) && - (psel == 8 || psel == 0x10 || psel == 0x28)) - /* add events on higher-numbered bus */ - mmcr1 |= 1ul << (MMCR1_PMC1_ADDER_SEL_SH - pmc); - } else { - /* Instructions or run cycles on PMC5/6 */ - --pmc; - } - if (isbus && unit == PM_GRS) { - bit = psel & 7; - grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; - mmcr1 |= (unsigned long)grsel << grsel_shift[bit]; - } - if (power5p_marked_instr_event(event[i])) - mmcra |= MMCRA_SAMPLE_ENABLE; - if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1)) - /* select alternate byte lane */ - psel |= 0x10; - if (pmc <= 3) - mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); - hwc[i] = pmc; - } - - /* Return MMCRx values */ - mmcr[0] = 0; - if (pmc_inuse & 1) - mmcr[0] = MMCR0_PMC1CE; - if (pmc_inuse & 0x3e) - mmcr[0] |= MMCR0_PMCjCE; - mmcr[1] = mmcr1; - mmcr[2] = mmcra; - return 0; -} - -static void power5p_disable_pmc(unsigned int pmc, unsigned long mmcr[]) -{ - if (pmc <= 3) - mmcr[1] &= ~(0x7fUL << 
MMCR1_PMCSEL_SH(pmc)); -} - -static int power5p_generic_events[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0xf, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ - [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ - [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ -}; - -#define C(x) PERF_COUNT_HW_CACHE_##x - -/* - * Table of generalized cache-related events. - * 0 means not supported, -1 means nonsensical, other values - * are event codes. - */ -static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { - [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x1c10a8, 0x3c1088 }, - [C(OP_WRITE)] = { 0x2c10a8, 0xc10c3 }, - [C(OP_PREFETCH)] = { 0xc70e7, -1 }, - }, - [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { 0, 0 }, - }, - [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0 }, - [C(OP_WRITE)] = { 0, 0 }, - [C(OP_PREFETCH)] = { 0xc50c3, 0 }, - }, - [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0xc20e4, 0x800c4 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x800c0 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x230e4, 0x230e5 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { -1, -1 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, -}; - -static struct power_pmu power5p_pmu = { - .name = "POWER5+/++", - .n_counter = 6, - .max_alternatives = MAX_ALT, - .add_fields = 0x7000000000055ul, - .test_adder = 0x3000040000000ul, - .compute_mmcr = power5p_compute_mmcr, - .get_constraint = power5p_get_constraint, - 
.get_alternatives = power5p_get_alternatives, - .disable_pmc = power5p_disable_pmc, - .limited_pmc_event = power5p_limited_pmc_event, - .flags = PPMU_LIMITED_PMC5_6, - .n_generic = ARRAY_SIZE(power5p_generic_events), - .generic_events = power5p_generic_events, - .cache_events = &power5p_cache_events, -}; - -static int __init init_power5p_pmu(void) -{ - if (!cur_cpu_spec->oprofile_cpu_type || - (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5+") - && strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5++"))) - return -ENODEV; - - return register_power_pmu(&power5p_pmu); -} - -early_initcall(init_power5p_pmu); diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c deleted file mode 100644 index e7f06eb7a861..000000000000 --- a/arch/powerpc/kernel/power5-pmu.c +++ /dev/null @@ -1,629 +0,0 @@ -/* - * Performance counter support for POWER5 (not POWER5++) processors. - * - * Copyright 2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ -#include -#include -#include -#include -#include - -/* - * Bits in event code for POWER5 (not POWER5++) - */ -#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ -#define PM_PMC_MSK 0xf -#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) -#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */ -#define PM_UNIT_MSK 0xf -#define PM_BYTE_SH 12 /* Byte number of event bus to use */ -#define PM_BYTE_MSK 7 -#define PM_GRS_SH 8 /* Storage subsystem mux select */ -#define PM_GRS_MSK 7 -#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */ -#define PM_PMCSEL_MSK 0x7f - -/* Values in PM_UNIT field */ -#define PM_FPU 0 -#define PM_ISU0 1 -#define PM_IFU 2 -#define PM_ISU1 3 -#define PM_IDU 4 -#define PM_ISU0_ALT 6 -#define PM_GRS 7 -#define PM_LSU0 8 -#define PM_LSU1 0xc -#define PM_LASTUNIT 0xc - -/* - * Bits in MMCR1 for POWER5 - */ -#define MMCR1_TTM0SEL_SH 62 -#define MMCR1_TTM1SEL_SH 60 -#define MMCR1_TTM2SEL_SH 58 -#define MMCR1_TTM3SEL_SH 56 -#define MMCR1_TTMSEL_MSK 3 -#define MMCR1_TD_CP_DBG0SEL_SH 54 -#define MMCR1_TD_CP_DBG1SEL_SH 52 -#define MMCR1_TD_CP_DBG2SEL_SH 50 -#define MMCR1_TD_CP_DBG3SEL_SH 48 -#define MMCR1_GRS_L2SEL_SH 46 -#define MMCR1_GRS_L2SEL_MSK 3 -#define MMCR1_GRS_L3SEL_SH 44 -#define MMCR1_GRS_L3SEL_MSK 3 -#define MMCR1_GRS_MCSEL_SH 41 -#define MMCR1_GRS_MCSEL_MSK 7 -#define MMCR1_GRS_FABSEL_SH 39 -#define MMCR1_GRS_FABSEL_MSK 3 -#define MMCR1_PMC1_ADDER_SEL_SH 35 -#define MMCR1_PMC2_ADDER_SEL_SH 34 -#define MMCR1_PMC3_ADDER_SEL_SH 33 -#define MMCR1_PMC4_ADDER_SEL_SH 32 -#define MMCR1_PMC1SEL_SH 25 -#define MMCR1_PMC2SEL_SH 17 -#define MMCR1_PMC3SEL_SH 9 -#define MMCR1_PMC4SEL_SH 1 -#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) -#define MMCR1_PMCSEL_MSK 0x7f - -/* - * Layout of constraint bits: - * 6666555555555544444444443333333333222222222211111111110000000000 - * 3210987654321098765432109876543210987654321098765432109876543210 - * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><> - * T0T1 NC 
G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1 - * - * T0 - TTM0 constraint - * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000 - * - * T1 - TTM1 constraint - * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000 - * - * NC - number of counters - * 51: NC error 0x0008_0000_0000_0000 - * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 - * - * G0..G3 - GRS mux constraints - * 46-47: GRS_L2SEL value - * 44-45: GRS_L3SEL value - * 41-44: GRS_MCSEL value - * 39-40: GRS_FABSEL value - * Note that these match up with their bit positions in MMCR1 - * - * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS - * 37: UC3 error 0x20_0000_0000 - * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000 - * 35: ISU0 events needed 0x08_0000_0000 - * 34: IDU|GRS events needed 0x04_0000_0000 - * - * PS1 - * 33: PS1 error 0x2_0000_0000 - * 31-32: count of events needing PMC1/2 0x1_8000_0000 - * - * PS2 - * 30: PS2 error 0x4000_0000 - * 28-29: count of events needing PMC3/4 0x3000_0000 - * - * B0 - * 24-27: Byte 0 event source 0x0f00_0000 - * Encoding as for the event code - * - * B1, B2, B3 - * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources - * - * P1..P6 - * 0-11: Count of events needing PMC1..PMC6 - */ - -static const int grsel_shift[8] = { - MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, - MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, - MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH -}; - -/* Masks and values for using events from the various units */ -static unsigned long unit_cons[PM_LASTUNIT+1][2] = { - [PM_FPU] = { 0xc0002000000000ul, 0x00001000000000ul }, - [PM_ISU0] = { 0x00002000000000ul, 0x00000800000000ul }, - [PM_ISU1] = { 0xc0002000000000ul, 0xc0001000000000ul }, - [PM_IFU] = { 0xc0002000000000ul, 0x80001000000000ul }, - [PM_IDU] = { 0x30002000000000ul, 0x00000400000000ul }, - [PM_GRS] = { 0x30002000000000ul, 0x30000400000000ul }, -}; - -static int power5_get_constraint(u64 event, unsigned long *maskp, - 
unsigned long *valp) -{ - int pmc, byte, unit, sh; - int bit, fmask; - unsigned long mask = 0, value = 0; - int grp = -1; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc > 6) - return -1; - sh = (pmc - 1) * 2; - mask |= 2 << sh; - value |= 1 << sh; - if (pmc <= 4) - grp = (pmc - 1) >> 1; - else if (event != 0x500009 && event != 0x600005) - return -1; - } - if (event & PM_BUSEVENT_MSK) { - unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; - if (unit > PM_LASTUNIT) - return -1; - if (unit == PM_ISU0_ALT) - unit = PM_ISU0; - mask |= unit_cons[unit][0]; - value |= unit_cons[unit][1]; - byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - if (byte >= 4) { - if (unit != PM_LSU1) - return -1; - /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ - ++unit; - byte &= 3; - } - if (unit == PM_GRS) { - bit = event & 7; - fmask = (bit == 6)? 7: 3; - sh = grsel_shift[bit]; - mask |= (unsigned long)fmask << sh; - value |= (unsigned long)((event >> PM_GRS_SH) & fmask) - << sh; - } - /* - * Bus events on bytes 0 and 2 can be counted - * on PMC1/2; bytes 1 and 3 on PMC3/4. 
- */ - if (!pmc) - grp = byte & 1; - /* Set byte lane select field */ - mask |= 0xfUL << (24 - 4 * byte); - value |= (unsigned long)unit << (24 - 4 * byte); - } - if (grp == 0) { - /* increment PMC1/2 field */ - mask |= 0x200000000ul; - value |= 0x080000000ul; - } else if (grp == 1) { - /* increment PMC3/4 field */ - mask |= 0x40000000ul; - value |= 0x10000000ul; - } - if (pmc < 5) { - /* need a counter from PMC1-4 set */ - mask |= 0x8000000000000ul; - value |= 0x1000000000000ul; - } - *maskp = mask; - *valp = value; - return 0; -} - -#define MAX_ALT 3 /* at most 3 alternatives for any event */ - -static const unsigned int event_alternatives[][MAX_ALT] = { - { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */ - { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ - { 0x100005, 0x600005 }, /* PM_RUN_CYC */ - { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */ - { 0x300009, 0x400009 }, /* PM_INST_DISP */ -}; - -/* - * Scan the alternatives table for a match and return the - * index into the alternatives table if found, else -1. - */ -static int find_alternative(u64 event) -{ - int i, j; - - for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { - if (event < event_alternatives[i][0]) - break; - for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) - if (event == event_alternatives[i][j]) - return i; - } - return -1; -} - -static const unsigned char bytedecode_alternatives[4][4] = { - /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 }, - /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e }, - /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 }, - /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e } -}; - -/* - * Some direct events for decodes of event bus byte 3 have alternative - * PMCSEL values on other counters. This returns the alternative - * event code for those that do, or -1 otherwise. 
- */ -static s64 find_alternative_bdecode(u64 event) -{ - int pmc, altpmc, pp, j; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc == 0 || pmc > 4) - return -1; - altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */ - pp = event & PM_PMCSEL_MSK; - for (j = 0; j < 4; ++j) { - if (bytedecode_alternatives[pmc - 1][j] == pp) { - return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | - (altpmc << PM_PMC_SH) | - bytedecode_alternatives[altpmc - 1][j]; - } - } - return -1; -} - -static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[]) -{ - int i, j, nalt = 1; - s64 ae; - - alt[0] = event; - nalt = 1; - i = find_alternative(event); - if (i >= 0) { - for (j = 0; j < MAX_ALT; ++j) { - ae = event_alternatives[i][j]; - if (ae && ae != event) - alt[nalt++] = ae; - } - } else { - ae = find_alternative_bdecode(event); - if (ae > 0) - alt[nalt++] = ae; - } - return nalt; -} - -/* - * Map of which direct events on which PMCs are marked instruction events. - * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event. - * Bit 0 is set if it is marked for all PMCs. - * The 0x80 bit indicates a byte decode PMCSEL value. 
- */ -static unsigned char direct_event_is_marked[0x28] = { - 0, /* 00 */ - 0x1f, /* 01 PM_IOPS_CMPL */ - 0x2, /* 02 PM_MRK_GRP_DISP */ - 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ - 0, /* 04 */ - 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */ - 0x80, /* 06 */ - 0x80, /* 07 */ - 0, 0, 0,/* 08 - 0a */ - 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */ - 0, /* 0c */ - 0x80, /* 0d */ - 0x80, /* 0e */ - 0, /* 0f */ - 0, /* 10 */ - 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */ - 0, /* 12 */ - 0x10, /* 13 PM_MRK_GRP_CMPL */ - 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */ - 0x2, /* 15 PM_MRK_GRP_ISSUED */ - 0x80, /* 16 */ - 0x80, /* 17 */ - 0, 0, 0, 0, 0, - 0x80, /* 1d */ - 0x80, /* 1e */ - 0, /* 1f */ - 0x80, /* 20 */ - 0x80, /* 21 */ - 0x80, /* 22 */ - 0x80, /* 23 */ - 0x80, /* 24 */ - 0x80, /* 25 */ - 0x80, /* 26 */ - 0x80, /* 27 */ -}; - -/* - * Returns 1 if event counts things relating to marked instructions - * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
- */ -static int power5_marked_instr_event(u64 event) -{ - int pmc, psel; - int bit, byte, unit; - u32 mask; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - psel = event & PM_PMCSEL_MSK; - if (pmc >= 5) - return 0; - - bit = -1; - if (psel < sizeof(direct_event_is_marked)) { - if (direct_event_is_marked[psel] & (1 << pmc)) - return 1; - if (direct_event_is_marked[psel] & 0x80) - bit = 4; - else if (psel == 0x08) - bit = pmc - 1; - else if (psel == 0x10) - bit = 4 - pmc; - else if (psel == 0x1b && (pmc == 1 || pmc == 3)) - bit = 4; - } else if ((psel & 0x58) == 0x40) - bit = psel & 7; - - if (!(event & PM_BUSEVENT_MSK)) - return 0; - - byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; - if (unit == PM_LSU0) { - /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */ - mask = 0x5dff00; - } else if (unit == PM_LSU1 && byte >= 4) { - byte -= 4; - /* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */ - mask = 0x5f00c0aa; - } else - return 0; - - return (mask >> (byte * 8 + bit)) & 1; -} - -static int power5_compute_mmcr(u64 event[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]) -{ - unsigned long mmcr1 = 0; - unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS; - unsigned int pmc, unit, byte, psel; - unsigned int ttm, grp; - int i, isbus, bit, grsel; - unsigned int pmc_inuse = 0; - unsigned int pmc_grp_use[2]; - unsigned char busbyte[4]; - unsigned char unituse[16]; - int ttmuse; - - if (n_ev > 6) - return -1; - - /* First pass to count resource use */ - pmc_grp_use[0] = pmc_grp_use[1] = 0; - memset(busbyte, 0, sizeof(busbyte)); - memset(unituse, 0, sizeof(unituse)); - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc > 6) - return -1; - if (pmc_inuse & (1 << (pmc - 1))) - return -1; - pmc_inuse |= 1 << (pmc - 1); - /* count 1/2 vs 3/4 use */ - if (pmc <= 4) - ++pmc_grp_use[(pmc - 1) >> 1]; - } - if (event[i] & PM_BUSEVENT_MSK) { - unit = (event[i] >> PM_UNIT_SH) & 
PM_UNIT_MSK; - byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; - if (unit > PM_LASTUNIT) - return -1; - if (unit == PM_ISU0_ALT) - unit = PM_ISU0; - if (byte >= 4) { - if (unit != PM_LSU1) - return -1; - ++unit; - byte &= 3; - } - if (!pmc) - ++pmc_grp_use[byte & 1]; - if (busbyte[byte] && busbyte[byte] != unit) - return -1; - busbyte[byte] = unit; - unituse[unit] = 1; - } - } - if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2) - return -1; - - /* - * Assign resources and set multiplexer selects. - * - * PM_ISU0 can go either on TTM0 or TTM1, but that's the only - * choice we have to deal with. - */ - if (unituse[PM_ISU0] & - (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { - unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */ - unituse[PM_ISU0] = 0; - } - /* Set TTM[01]SEL fields. */ - ttmuse = 0; - for (i = PM_FPU; i <= PM_ISU1; ++i) { - if (!unituse[i]) - continue; - if (ttmuse++) - return -1; - mmcr1 |= (unsigned long)i << MMCR1_TTM0SEL_SH; - } - ttmuse = 0; - for (; i <= PM_GRS; ++i) { - if (!unituse[i]) - continue; - if (ttmuse++) - return -1; - mmcr1 |= (unsigned long)(i & 3) << MMCR1_TTM1SEL_SH; - } - if (ttmuse > 1) - return -1; - - /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. 
*/ - for (byte = 0; byte < 4; ++byte) { - unit = busbyte[byte]; - if (!unit) - continue; - if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { - /* get ISU0 through TTM1 rather than TTM0 */ - unit = PM_ISU0_ALT; - } else if (unit == PM_LSU1 + 1) { - /* select lower word of LSU1 for this byte */ - mmcr1 |= 1ul << (MMCR1_TTM3SEL_SH + 3 - byte); - } - ttm = unit >> 2; - mmcr1 |= (unsigned long)ttm - << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); - } - - /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; - byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; - psel = event[i] & PM_PMCSEL_MSK; - isbus = event[i] & PM_BUSEVENT_MSK; - if (!pmc) { - /* Bus event or any-PMC direct event */ - for (pmc = 0; pmc < 4; ++pmc) { - if (pmc_inuse & (1 << pmc)) - continue; - grp = (pmc >> 1) & 1; - if (isbus) { - if (grp == (byte & 1)) - break; - } else if (pmc_grp_use[grp] < 2) { - ++pmc_grp_use[grp]; - break; - } - } - pmc_inuse |= 1 << pmc; - } else if (pmc <= 4) { - /* Direct event */ - --pmc; - if ((psel == 8 || psel == 0x10) && isbus && (byte & 2)) - /* add events on higher-numbered bus */ - mmcr1 |= 1ul << (MMCR1_PMC1_ADDER_SEL_SH - pmc); - } else { - /* Instructions or run cycles on PMC5/6 */ - --pmc; - } - if (isbus && unit == PM_GRS) { - bit = psel & 7; - grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; - mmcr1 |= (unsigned long)grsel << grsel_shift[bit]; - } - if (power5_marked_instr_event(event[i])) - mmcra |= MMCRA_SAMPLE_ENABLE; - if (pmc <= 3) - mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); - hwc[i] = pmc; - } - - /* Return MMCRx values */ - mmcr[0] = 0; - if (pmc_inuse & 1) - mmcr[0] = MMCR0_PMC1CE; - if (pmc_inuse & 0x3e) - mmcr[0] |= MMCR0_PMCjCE; - mmcr[1] = mmcr1; - mmcr[2] = mmcra; - return 0; -} - -static void power5_disable_pmc(unsigned int pmc, unsigned long mmcr[]) -{ - if (pmc <= 3) - mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); -} - -static 
int power5_generic_events[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0xf, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ - [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ - [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ -}; - -#define C(x) PERF_COUNT_HW_CACHE_##x - -/* - * Table of generalized cache-related events. - * 0 means not supported, -1 means nonsensical, other values - * are event codes. - */ -static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { - [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x4c1090, 0x3c1088 }, - [C(OP_WRITE)] = { 0x3c1090, 0xc10c3 }, - [C(OP_PREFETCH)] = { 0xc70e7, 0 }, - }, - [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { 0, 0 }, - }, - [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x3c309b }, - [C(OP_WRITE)] = { 0, 0 }, - [C(OP_PREFETCH)] = { 0xc50c3, 0 }, - }, - [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x2c4090, 0x800c4 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x800c0 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x230e4, 0x230e5 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { -1, -1 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, -}; - -static struct power_pmu power5_pmu = { - .name = "POWER5", - .n_counter = 6, - .max_alternatives = MAX_ALT, - .add_fields = 0x7000090000555ul, - .test_adder = 0x3000490000000ul, - .compute_mmcr = power5_compute_mmcr, - .get_constraint = power5_get_constraint, - .get_alternatives = power5_get_alternatives, - 
.disable_pmc = power5_disable_pmc, - .n_generic = ARRAY_SIZE(power5_generic_events), - .generic_events = power5_generic_events, - .cache_events = &power5_cache_events, -}; - -static int __init init_power5_pmu(void) -{ - if (!cur_cpu_spec->oprofile_cpu_type || - strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5")) - return -ENODEV; - - return register_power_pmu(&power5_pmu); -} - -early_initcall(init_power5_pmu); diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c deleted file mode 100644 index 0bbc901e7efc..000000000000 --- a/arch/powerpc/kernel/power6-pmu.c +++ /dev/null @@ -1,552 +0,0 @@ -/* - * Performance counter support for POWER6 processors. - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include - -/* - * Bits in event code for POWER6 - */ -#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ -#define PM_PMC_MSK 0x7 -#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) -#define PM_UNIT_SH 16 /* Unit event comes (TTMxSEL encoding) */ -#define PM_UNIT_MSK 0xf -#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH) -#define PM_LLAV 0x8000 /* Load lookahead match value */ -#define PM_LLA 0x4000 /* Load lookahead match enable */ -#define PM_BYTE_SH 12 /* Byte of event bus to use */ -#define PM_BYTE_MSK 3 -#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) 
*/ -#define PM_SUBUNIT_MSK 7 -#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH) -#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */ -#define PM_BUSEVENT_MSK 0xf3700 - -/* - * Bits in MMCR1 for POWER6 - */ -#define MMCR1_TTM0SEL_SH 60 -#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4) -#define MMCR1_TTMSEL_MSK 0xf -#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK) -#define MMCR1_NESTSEL_SH 45 -#define MMCR1_NESTSEL_MSK 0x7 -#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK) -#define MMCR1_PMC1_LLA (1ul << 44) -#define MMCR1_PMC1_LLA_VALUE (1ul << 39) -#define MMCR1_PMC1_ADDR_SEL (1ul << 35) -#define MMCR1_PMC1SEL_SH 24 -#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) -#define MMCR1_PMCSEL_MSK 0xff - -/* - * Map of which direct events on which PMCs are marked instruction events. - * Indexed by PMCSEL value >> 1. - * Bottom 4 bits are a map of which PMCs are interesting, - * top 4 bits say what sort of event: - * 0 = direct marked event, - * 1 = byte decode event, - * 4 = add/and event (PMC1 -> bits 0 & 4), - * 5 = add/and event (PMC1 -> bits 1 & 5), - * 6 = add/and event (PMC1 -> bits 2 & 6), - * 7 = add/and event (PMC1 -> bits 3 & 7). 
- */ -static unsigned char direct_event_is_marked[0x60 >> 1] = { - 0, /* 00 */ - 0, /* 02 */ - 0, /* 04 */ - 0x07, /* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ - 0x04, /* 08 PM_MRK_DFU_FIN */ - 0x06, /* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */ - 0, /* 0c */ - 0, /* 0e */ - 0x02, /* 10 PM_MRK_INST_DISP */ - 0x08, /* 12 PM_MRK_LSU_DERAT_MISS */ - 0, /* 14 */ - 0, /* 16 */ - 0x0c, /* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */ - 0x0f, /* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */ - 0x01, /* 1c PM_MRK_INST_ISSUED */ - 0, /* 1e */ - 0, /* 20 */ - 0, /* 22 */ - 0, /* 24 */ - 0, /* 26 */ - 0x15, /* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */ - 0, /* 2a */ - 0, /* 2c */ - 0, /* 2e */ - 0x4f, /* 30 */ - 0x7f, /* 32 */ - 0x4f, /* 34 */ - 0x5f, /* 36 */ - 0x6f, /* 38 */ - 0x4f, /* 3a */ - 0, /* 3c */ - 0x08, /* 3e PM_MRK_INST_TIMEO */ - 0x1f, /* 40 */ - 0x1f, /* 42 */ - 0x1f, /* 44 */ - 0x1f, /* 46 */ - 0x1f, /* 48 */ - 0x1f, /* 4a */ - 0x1f, /* 4c */ - 0x1f, /* 4e */ - 0, /* 50 */ - 0x05, /* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */ - 0x1c, /* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */ - 0x02, /* 56 PM_MRK_LD_MISS_L1 */ - 0, /* 58 */ - 0, /* 5a */ - 0, /* 5c */ - 0, /* 5e */ -}; - -/* - * Masks showing for each unit which bits are marked events. - * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0. 
- */ -static u32 marked_bus_events[16] = { - 0x01000000, /* direct events set 1: byte 3 bit 0 */ - 0x00010000, /* direct events set 2: byte 2 bit 0 */ - 0, 0, 0, 0, /* IDU, IFU, nest: nothing */ - 0x00000088, /* VMX set 1: byte 0 bits 3, 7 */ - 0x000000c0, /* VMX set 2: byte 0 bits 4-7 */ - 0x04010000, /* LSU set 1: byte 2 bit 0, byte 3 bit 2 */ - 0xff010000u, /* LSU set 2: byte 2 bit 0, all of byte 3 */ - 0, /* LSU set 3 */ - 0x00000010, /* VMX set 3: byte 0 bit 4 */ - 0, /* BFP set 1 */ - 0x00000022, /* BFP set 2: byte 0 bits 1, 5 */ - 0, 0 -}; - -/* - * Returns 1 if event counts things relating to marked instructions - * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. - */ -static int power6_marked_instr_event(u64 event) -{ - int pmc, psel, ptype; - int bit, byte, unit; - u32 mask; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - psel = (event & PM_PMCSEL_MSK) >> 1; /* drop edge/level bit */ - if (pmc >= 5) - return 0; - - bit = -1; - if (psel < sizeof(direct_event_is_marked)) { - ptype = direct_event_is_marked[psel]; - if (pmc == 0 || !(ptype & (1 << (pmc - 1)))) - return 0; - ptype >>= 4; - if (ptype == 0) - return 1; - if (ptype == 1) - bit = 0; - else - bit = ptype ^ (pmc - 1); - } else if ((psel & 0x48) == 0x40) - bit = psel & 7; - - if (!(event & PM_BUSEVENT_MSK) || bit == -1) - return 0; - - byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; - mask = marked_bus_events[unit]; - return (mask >> (byte * 8 + bit)) & 1; -} - -/* - * Assign PMC numbers and compute MMCR1 value for a set of events - */ -static int p6_compute_mmcr(u64 event[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]) -{ - unsigned long mmcr1 = 0; - unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS; - int i; - unsigned int pmc, ev, b, u, s, psel; - unsigned int ttmset = 0; - unsigned int pmc_inuse = 0; - - if (n_ev > 6) - return -1; - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) 
{ - if (pmc_inuse & (1 << (pmc - 1))) - return -1; /* collision! */ - pmc_inuse |= 1 << (pmc - 1); - } - } - for (i = 0; i < n_ev; ++i) { - ev = event[i]; - pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - --pmc; - } else { - /* can go on any PMC; find a free one */ - for (pmc = 0; pmc < 4; ++pmc) - if (!(pmc_inuse & (1 << pmc))) - break; - if (pmc >= 4) - return -1; - pmc_inuse |= 1 << pmc; - } - hwc[i] = pmc; - psel = ev & PM_PMCSEL_MSK; - if (ev & PM_BUSEVENT_MSK) { - /* this event uses the event bus */ - b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK; - u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK; - /* check for conflict on this byte of event bus */ - if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u) - return -1; - mmcr1 |= (unsigned long)u << MMCR1_TTMSEL_SH(b); - ttmset |= 1 << b; - if (u == 5) { - /* Nest events have a further mux */ - s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK; - if ((ttmset & 0x10) && - MMCR1_NESTSEL(mmcr1) != s) - return -1; - ttmset |= 0x10; - mmcr1 |= (unsigned long)s << MMCR1_NESTSEL_SH; - } - if (0x30 <= psel && psel <= 0x3d) { - /* these need the PMCx_ADDR_SEL bits */ - if (b >= 2) - mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc; - } - /* bus select values are different for PMC3/4 */ - if (pmc >= 2 && (psel & 0x90) == 0x80) - psel ^= 0x20; - } - if (ev & PM_LLA) { - mmcr1 |= MMCR1_PMC1_LLA >> pmc; - if (ev & PM_LLAV) - mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc; - } - if (power6_marked_instr_event(event[i])) - mmcra |= MMCRA_SAMPLE_ENABLE; - if (pmc < 4) - mmcr1 |= (unsigned long)psel << MMCR1_PMCSEL_SH(pmc); - } - mmcr[0] = 0; - if (pmc_inuse & 1) - mmcr[0] = MMCR0_PMC1CE; - if (pmc_inuse & 0xe) - mmcr[0] |= MMCR0_PMCjCE; - mmcr[1] = mmcr1; - mmcr[2] = mmcra; - return 0; -} - -/* - * Layout of constraint bits: - * - * 0-1 add field: number of uses of PMC1 (max 1) - * 2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6 - * 12-15 add field: number of uses of PMC1-4 (max 4) - * 16-19 select field: unit on byte 0 of event bus - * 20-23, 24-27, 28-31 ditto 
for bytes 1, 2, 3 - * 32-34 select field: nest (subunit) event selector - */ -static int p6_get_constraint(u64 event, unsigned long *maskp, - unsigned long *valp) -{ - int pmc, byte, sh, subunit; - unsigned long mask = 0, value = 0; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc > 4 && !(event == 0x500009 || event == 0x600005)) - return -1; - sh = (pmc - 1) * 2; - mask |= 2 << sh; - value |= 1 << sh; - } - if (event & PM_BUSEVENT_MSK) { - byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - sh = byte * 4 + (16 - PM_UNIT_SH); - mask |= PM_UNIT_MSKS << sh; - value |= (unsigned long)(event & PM_UNIT_MSKS) << sh; - if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) { - subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK; - mask |= (unsigned long)PM_SUBUNIT_MSK << 32; - value |= (unsigned long)subunit << 32; - } - } - if (pmc <= 4) { - mask |= 0x8000; /* add field for count of PMC1-4 uses */ - value |= 0x1000; - } - *maskp = mask; - *valp = value; - return 0; -} - -static int p6_limited_pmc_event(u64 event) -{ - int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - - return pmc == 5 || pmc == 6; -} - -#define MAX_ALT 4 /* at most 4 alternatives for any event */ - -static const unsigned int event_alternatives[][MAX_ALT] = { - { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */ - { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */ - { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */ - { 0x10000a, 0x2000f4, 0x600005 }, /* PM_RUN_CYC */ - { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */ - { 0x10000e, 0x400010 }, /* PM_PURR */ - { 0x100010, 0x4000f8 }, /* PM_FLUSH */ - { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */ - { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */ - { 0x100054, 0x2000f0 }, /* PM_ST_FIN */ - { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */ - { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */ - { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */ - { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */ - { 0x20000e, 0x400007 }, /* 
PM_LSU_DERAT_MISS */ - { 0x200012, 0x300012 }, /* PM_INST_DISP */ - { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */ - { 0x2000f8, 0x300010 }, /* PM_EXT_INT */ - { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */ - { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */ - { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */ - { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */ - { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */ -}; - -/* - * This could be made more efficient with a binary search on - * a presorted list, if necessary - */ -static int find_alternatives_list(u64 event) -{ - int i, j; - unsigned int alt; - - for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { - if (event < event_alternatives[i][0]) - return -1; - for (j = 0; j < MAX_ALT; ++j) { - alt = event_alternatives[i][j]; - if (!alt || event < alt) - break; - if (event == alt) - return i; - } - } - return -1; -} - -static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[]) -{ - int i, j, nlim; - unsigned int psel, pmc; - unsigned int nalt = 1; - u64 aevent; - - alt[0] = event; - nlim = p6_limited_pmc_event(event); - - /* check the alternatives table */ - i = find_alternatives_list(event); - if (i >= 0) { - /* copy out alternatives from list */ - for (j = 0; j < MAX_ALT; ++j) { - aevent = event_alternatives[i][j]; - if (!aevent) - break; - if (aevent != event) - alt[nalt++] = aevent; - nlim += p6_limited_pmc_event(aevent); - } - - } else { - /* Check for alternative ways of computing sum events */ - /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */ - psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */ - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc && (psel == 0x32 || psel == 0x34)) - alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) | - ((5 - pmc) << PM_PMC_SH); - - /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */ - if (pmc && (psel == 0x38 || psel == 0x3a)) - alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) | - ((pmc > 2? 
pmc - 2: pmc + 2) << PM_PMC_SH); - } - - if (flags & PPMU_ONLY_COUNT_RUN) { - /* - * We're only counting in RUN state, - * so PM_CYC is equivalent to PM_RUN_CYC, - * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR. - * This doesn't include alternatives that don't provide - * any extra flexibility in assigning PMCs (e.g. - * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC). - * Note that even with these additional alternatives - * we never end up with more than 4 alternatives for any event. - */ - j = nalt; - for (i = 0; i < nalt; ++i) { - switch (alt[i]) { - case 0x1e: /* PM_CYC */ - alt[j++] = 0x600005; /* PM_RUN_CYC */ - ++nlim; - break; - case 0x10000a: /* PM_RUN_CYC */ - alt[j++] = 0x1e; /* PM_CYC */ - break; - case 2: /* PM_INST_CMPL */ - alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */ - ++nlim; - break; - case 0x500009: /* PM_RUN_INST_CMPL */ - alt[j++] = 2; /* PM_INST_CMPL */ - break; - case 0x10000e: /* PM_PURR */ - alt[j++] = 0x4000f4; /* PM_RUN_PURR */ - break; - case 0x4000f4: /* PM_RUN_PURR */ - alt[j++] = 0x10000e; /* PM_PURR */ - break; - } - } - nalt = j; - } - - if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) { - /* remove the limited PMC events */ - j = 0; - for (i = 0; i < nalt; ++i) { - if (!p6_limited_pmc_event(alt[i])) { - alt[j] = alt[i]; - ++j; - } - } - nalt = j; - } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) { - /* remove all but the limited PMC events */ - j = 0; - for (i = 0; i < nalt; ++i) { - if (p6_limited_pmc_event(alt[i])) { - alt[j] = alt[i]; - ++j; - } - } - nalt = j; - } - - return nalt; -} - -static void p6_disable_pmc(unsigned int pmc, unsigned long mmcr[]) -{ - /* Set PMCxSEL to 0 to disable PMCx */ - if (pmc <= 3) - mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); -} - -static int power6_generic_events[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x1e, - [PERF_COUNT_HW_INSTRUCTIONS] = 2, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ - [PERF_COUNT_HW_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ - 
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ - [PERF_COUNT_HW_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ -}; - -#define C(x) PERF_COUNT_HW_CACHE_##x - -/* - * Table of generalized cache-related events. - * 0 means not supported, -1 means nonsensical, other values - * are event codes. - * The "DTLB" and "ITLB" events relate to the DERAT and IERAT. - */ -static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { - [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x280030, 0x80080 }, - [C(OP_WRITE)] = { 0x180032, 0x80088 }, - [C(OP_PREFETCH)] = { 0x810a4, 0 }, - }, - [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x100056 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { 0x4008c, 0 }, - }, - [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x150730, 0x250532 }, - [C(OP_WRITE)] = { 0x250432, 0x150432 }, - [C(OP_PREFETCH)] = { 0x810a6, 0 }, - }, - [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x20000e }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x420ce }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x430e6, 0x400052 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { -1, -1 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, -}; - -static struct power_pmu power6_pmu = { - .name = "POWER6", - .n_counter = 6, - .max_alternatives = MAX_ALT, - .add_fields = 0x1555, - .test_adder = 0x3000, - .compute_mmcr = p6_compute_mmcr, - .get_constraint = p6_get_constraint, - .get_alternatives = p6_get_alternatives, - .disable_pmc = p6_disable_pmc, - .limited_pmc_event = p6_limited_pmc_event, - .flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR, - .n_generic = ARRAY_SIZE(power6_generic_events), - 
.generic_events = power6_generic_events, - .cache_events = &power6_cache_events, -}; - -static int __init init_power6_pmu(void) -{ - if (!cur_cpu_spec->oprofile_cpu_type || - strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power6")) - return -ENODEV; - - return register_power_pmu(&power6_pmu); -} - -early_initcall(init_power6_pmu); diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c deleted file mode 100644 index 1251e4d7e262..000000000000 --- a/arch/powerpc/kernel/power7-pmu.c +++ /dev/null @@ -1,379 +0,0 @@ -/* - * Performance counter support for POWER7 processors. - * - * Copyright 2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include - -/* - * Bits in event code for POWER7 - */ -#define PM_PMC_SH 16 /* PMC number (1-based) for direct events */ -#define PM_PMC_MSK 0xf -#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) -#define PM_UNIT_SH 12 /* TTMMUX number and setting - unit select */ -#define PM_UNIT_MSK 0xf -#define PM_COMBINE_SH 11 /* Combined event bit */ -#define PM_COMBINE_MSK 1 -#define PM_COMBINE_MSKS 0x800 -#define PM_L2SEL_SH 8 /* L2 event select */ -#define PM_L2SEL_MSK 7 -#define PM_PMCSEL_MSK 0xff - -/* - * Bits in MMCR1 for POWER7 - */ -#define MMCR1_TTM0SEL_SH 60 -#define MMCR1_TTM1SEL_SH 56 -#define MMCR1_TTM2SEL_SH 52 -#define MMCR1_TTM3SEL_SH 48 -#define MMCR1_TTMSEL_MSK 0xf -#define MMCR1_L2SEL_SH 45 -#define MMCR1_L2SEL_MSK 7 -#define MMCR1_PMC1_COMBINE_SH 35 -#define MMCR1_PMC2_COMBINE_SH 34 -#define MMCR1_PMC3_COMBINE_SH 33 -#define MMCR1_PMC4_COMBINE_SH 32 -#define MMCR1_PMC1SEL_SH 24 -#define MMCR1_PMC2SEL_SH 16 -#define MMCR1_PMC3SEL_SH 8 -#define MMCR1_PMC4SEL_SH 0 -#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 
8) -#define MMCR1_PMCSEL_MSK 0xff - -/* - * Layout of constraint bits: - * 6666555555555544444444443333333333222222222211111111110000000000 - * 3210987654321098765432109876543210987654321098765432109876543210 - * [ ><><><><><><> - * NC P6P5P4P3P2P1 - * - * NC - number of counters - * 15: NC error 0x8000 - * 12-14: number of events needing PMC1-4 0x7000 - * - * P6 - * 11: P6 error 0x800 - * 10-11: Count of events needing PMC6 - * - * P1..P5 - * 0-9: Count of events needing PMC1..PMC5 - */ - -static int power7_get_constraint(u64 event, unsigned long *maskp, - unsigned long *valp) -{ - int pmc, sh; - unsigned long mask = 0, value = 0; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc > 6) - return -1; - sh = (pmc - 1) * 2; - mask |= 2 << sh; - value |= 1 << sh; - if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4)) - return -1; - } - if (pmc < 5) { - /* need a counter from PMC1-4 set */ - mask |= 0x8000; - value |= 0x1000; - } - *maskp = mask; - *valp = value; - return 0; -} - -#define MAX_ALT 2 /* at most 2 alternatives for any event */ - -static const unsigned int event_alternatives[][MAX_ALT] = { - { 0x200f2, 0x300f2 }, /* PM_INST_DISP */ - { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */ - { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */ -}; - -/* - * Scan the alternatives table for a match and return the - * index into the alternatives table if found, else -1. 
- */ -static int find_alternative(u64 event) -{ - int i, j; - - for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { - if (event < event_alternatives[i][0]) - break; - for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) - if (event == event_alternatives[i][j]) - return i; - } - return -1; -} - -static s64 find_alternative_decode(u64 event) -{ - int pmc, psel; - - /* this only handles the 4x decode events */ - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - psel = event & PM_PMCSEL_MSK; - if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40) - return event - (1 << PM_PMC_SH) + 8; - if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48) - return event + (1 << PM_PMC_SH) - 8; - return -1; -} - -static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[]) -{ - int i, j, nalt = 1; - s64 ae; - - alt[0] = event; - nalt = 1; - i = find_alternative(event); - if (i >= 0) { - for (j = 0; j < MAX_ALT; ++j) { - ae = event_alternatives[i][j]; - if (ae && ae != event) - alt[nalt++] = ae; - } - } else { - ae = find_alternative_decode(event); - if (ae > 0) - alt[nalt++] = ae; - } - - if (flags & PPMU_ONLY_COUNT_RUN) { - /* - * We're only counting in RUN state, - * so PM_CYC is equivalent to PM_RUN_CYC - * and PM_INST_CMPL === PM_RUN_INST_CMPL. - * This doesn't include alternatives that don't provide - * any extra flexibility in assigning PMCs. - */ - j = nalt; - for (i = 0; i < nalt; ++i) { - switch (alt[i]) { - case 0x1e: /* PM_CYC */ - alt[j++] = 0x600f4; /* PM_RUN_CYC */ - break; - case 0x600f4: /* PM_RUN_CYC */ - alt[j++] = 0x1e; - break; - case 0x2: /* PM_PPC_CMPL */ - alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */ - break; - case 0x500fa: /* PM_RUN_INST_CMPL */ - alt[j++] = 0x2; /* PM_PPC_CMPL */ - break; - } - } - nalt = j; - } - - return nalt; -} - -/* - * Returns 1 if event counts things relating to marked instructions - * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
- */ -static int power7_marked_instr_event(u64 event) -{ - int pmc, psel; - int unit; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; - psel = event & PM_PMCSEL_MSK & ~1; /* trim off edge/level bit */ - if (pmc >= 5) - return 0; - - switch (psel >> 4) { - case 2: - return pmc == 2 || pmc == 4; - case 3: - if (psel == 0x3c) - return pmc == 1; - if (psel == 0x3e) - return pmc != 2; - return 1; - case 4: - case 5: - return unit == 0xd; - case 6: - if (psel == 0x64) - return pmc >= 3; - case 8: - return unit == 0xd; - } - return 0; -} - -static int power7_compute_mmcr(u64 event[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]) -{ - unsigned long mmcr1 = 0; - unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS; - unsigned int pmc, unit, combine, l2sel, psel; - unsigned int pmc_inuse = 0; - int i; - - /* First pass to count resource use */ - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc > 6) - return -1; - if (pmc_inuse & (1 << (pmc - 1))) - return -1; - pmc_inuse |= 1 << (pmc - 1); - } - } - - /* Second pass: assign PMCs, set all MMCR1 fields */ - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; - combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK; - l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK; - psel = event[i] & PM_PMCSEL_MSK; - if (!pmc) { - /* Bus event or any-PMC direct event */ - for (pmc = 0; pmc < 4; ++pmc) { - if (!(pmc_inuse & (1 << pmc))) - break; - } - if (pmc >= 4) - return -1; - pmc_inuse |= 1 << pmc; - } else { - /* Direct or decoded event */ - --pmc; - } - if (pmc <= 3) { - mmcr1 |= (unsigned long) unit - << (MMCR1_TTM0SEL_SH - 4 * pmc); - mmcr1 |= (unsigned long) combine - << (MMCR1_PMC1_COMBINE_SH - pmc); - mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); - if (unit == 6) /* L2 events */ - mmcr1 |= (unsigned long) l2sel - << MMCR1_L2SEL_SH; - } - if 
(power7_marked_instr_event(event[i])) - mmcra |= MMCRA_SAMPLE_ENABLE; - hwc[i] = pmc; - } - - /* Return MMCRx values */ - mmcr[0] = 0; - if (pmc_inuse & 1) - mmcr[0] = MMCR0_PMC1CE; - if (pmc_inuse & 0x3e) - mmcr[0] |= MMCR0_PMCjCE; - mmcr[1] = mmcr1; - mmcr[2] = mmcra; - return 0; -} - -static void power7_disable_pmc(unsigned int pmc, unsigned long mmcr[]) -{ - if (pmc <= 3) - mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); -} - -static int power7_generic_events[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x1e, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x100f8, /* GCT_NOSLOT_CYC */ - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x4000a, /* CMPLU_STALL */ - [PERF_COUNT_HW_INSTRUCTIONS] = 2, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0xc880, /* LD_REF_L1_LSU*/ - [PERF_COUNT_HW_CACHE_MISSES] = 0x400f0, /* LD_MISS_L1 */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x10068, /* BRU_FIN */ - [PERF_COUNT_HW_BRANCH_MISSES] = 0x400f6, /* BR_MPRED */ -}; - -#define C(x) PERF_COUNT_HW_CACHE_##x - -/* - * Table of generalized cache-related events. - * 0 means not supported, -1 means nonsensical, other values - * are event codes. 
- */ -static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { - [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0xc880, 0x400f0 }, - [C(OP_WRITE)] = { 0, 0x300f0 }, - [C(OP_PREFETCH)] = { 0xd8b8, 0 }, - }, - [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x200fc }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { 0x408a, 0 }, - }, - [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x16080, 0x26080 }, - [C(OP_WRITE)] = { 0x16082, 0x26082 }, - [C(OP_PREFETCH)] = { 0, 0 }, - }, - [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x300fc }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x400fc }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x10068, 0x400f6 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { -1, -1 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, -}; - -static struct power_pmu power7_pmu = { - .name = "POWER7", - .n_counter = 6, - .max_alternatives = MAX_ALT + 1, - .add_fields = 0x1555ul, - .test_adder = 0x3000ul, - .compute_mmcr = power7_compute_mmcr, - .get_constraint = power7_get_constraint, - .get_alternatives = power7_get_alternatives, - .disable_pmc = power7_disable_pmc, - .flags = PPMU_ALT_SIPR, - .n_generic = ARRAY_SIZE(power7_generic_events), - .generic_events = power7_generic_events, - .cache_events = &power7_cache_events, -}; - -static int __init init_power7_pmu(void) -{ - if (!cur_cpu_spec->oprofile_cpu_type || - strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7")) - return -ENODEV; - - return register_power_pmu(&power7_pmu); -} - -early_initcall(init_power7_pmu); diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c deleted file mode 100644 
index 8c2190206964..000000000000 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ /dev/null @@ -1,502 +0,0 @@ -/* - * Performance counter support for PPC970-family processors. - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include - -/* - * Bits in event code for PPC970 - */ -#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */ -#define PM_PMC_MSK 0xf -#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ -#define PM_UNIT_MSK 0xf -#define PM_SPCSEL_SH 6 -#define PM_SPCSEL_MSK 3 -#define PM_BYTE_SH 4 /* Byte number of event bus to use */ -#define PM_BYTE_MSK 3 -#define PM_PMCSEL_MSK 0xf - -/* Values in PM_UNIT field */ -#define PM_NONE 0 -#define PM_FPU 1 -#define PM_VPU 2 -#define PM_ISU 3 -#define PM_IFU 4 -#define PM_IDU 5 -#define PM_STS 6 -#define PM_LSU0 7 -#define PM_LSU1U 8 -#define PM_LSU1L 9 -#define PM_LASTUNIT 9 - -/* - * Bits in MMCR0 for PPC970 - */ -#define MMCR0_PMC1SEL_SH 8 -#define MMCR0_PMC2SEL_SH 1 -#define MMCR_PMCSEL_MSK 0x1f - -/* - * Bits in MMCR1 for PPC970 - */ -#define MMCR1_TTM0SEL_SH 62 -#define MMCR1_TTM1SEL_SH 59 -#define MMCR1_TTM3SEL_SH 53 -#define MMCR1_TTMSEL_MSK 3 -#define MMCR1_TD_CP_DBG0SEL_SH 50 -#define MMCR1_TD_CP_DBG1SEL_SH 48 -#define MMCR1_TD_CP_DBG2SEL_SH 46 -#define MMCR1_TD_CP_DBG3SEL_SH 44 -#define MMCR1_PMC1_ADDER_SEL_SH 39 -#define MMCR1_PMC2_ADDER_SEL_SH 38 -#define MMCR1_PMC6_ADDER_SEL_SH 37 -#define MMCR1_PMC5_ADDER_SEL_SH 36 -#define MMCR1_PMC8_ADDER_SEL_SH 35 -#define MMCR1_PMC7_ADDER_SEL_SH 34 -#define MMCR1_PMC3_ADDER_SEL_SH 33 -#define MMCR1_PMC4_ADDER_SEL_SH 32 -#define MMCR1_PMC3SEL_SH 27 -#define MMCR1_PMC4SEL_SH 22 -#define MMCR1_PMC5SEL_SH 17 -#define MMCR1_PMC6SEL_SH 12 -#define 
MMCR1_PMC7SEL_SH 7 -#define MMCR1_PMC8SEL_SH 2 - -static short mmcr1_adder_bits[8] = { - MMCR1_PMC1_ADDER_SEL_SH, - MMCR1_PMC2_ADDER_SEL_SH, - MMCR1_PMC3_ADDER_SEL_SH, - MMCR1_PMC4_ADDER_SEL_SH, - MMCR1_PMC5_ADDER_SEL_SH, - MMCR1_PMC6_ADDER_SEL_SH, - MMCR1_PMC7_ADDER_SEL_SH, - MMCR1_PMC8_ADDER_SEL_SH -}; - -/* - * Layout of constraint bits: - * 6666555555555544444444443333333333222222222211111111110000000000 - * 3210987654321098765432109876543210987654321098765432109876543210 - * <><><>[ >[ >[ >< >< >< >< ><><><><><><><><> - * SPT0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 - * - * SP - SPCSEL constraint - * 48-49: SPCSEL value 0x3_0000_0000_0000 - * - * T0 - TTM0 constraint - * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000 - * - * T1 - TTM1 constraint - * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000 - * - * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS - * 43: UC3 error 0x0800_0000_0000 - * 42: FPU|IFU|VPU events needed 0x0400_0000_0000 - * 41: ISU events needed 0x0200_0000_0000 - * 40: IDU|STS events needed 0x0100_0000_0000 - * - * PS1 - * 39: PS1 error 0x0080_0000_0000 - * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 - * - * PS2 - * 35: PS2 error 0x0008_0000_0000 - * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 - * - * B0 - * 28-31: Byte 0 event source 0xf000_0000 - * Encoding as for the event code - * - * B1, B2, B3 - * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources - * - * P1 - * 15: P1 error 0x8000 - * 14-15: Count of events needing PMC1 - * - * P2..P8 - * 0-13: Count of events needing PMC2..PMC8 - */ - -static unsigned char direct_marked_event[8] = { - (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */ - (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */ - (1<<3) | (1<<5), /* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */ - (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */ - (1<<4) | (1<<5), /* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */ - (1<<3) | (1<<4) | 
(1<<5), - /* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */ - (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */ - (1<<4) /* PMC8: PM_MRK_LSU_FIN */ -}; - -/* - * Returns 1 if event counts things relating to marked instructions - * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. - */ -static int p970_marked_instr_event(u64 event) -{ - int pmc, psel, unit, byte, bit; - unsigned int mask; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - psel = event & PM_PMCSEL_MSK; - if (pmc) { - if (direct_marked_event[pmc - 1] & (1 << psel)) - return 1; - if (psel == 0) /* add events */ - bit = (pmc <= 4)? pmc - 1: 8 - pmc; - else if (psel == 7 || psel == 13) /* decode events */ - bit = 4; - else - return 0; - } else - bit = psel; - - byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; - mask = 0; - switch (unit) { - case PM_VPU: - mask = 0x4c; /* byte 0 bits 2,3,6 */ - break; - case PM_LSU0: - /* byte 2 bits 0,2,3,4,6; all of byte 1 */ - mask = 0x085dff00; - break; - case PM_LSU1L: - mask = 0x50 << 24; /* byte 3 bits 4,6 */ - break; - } - return (mask >> (byte * 8 + bit)) & 1; -} - -/* Masks and values for using events from the various units */ -static unsigned long unit_cons[PM_LASTUNIT+1][2] = { - [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull }, - [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull }, - [PM_ISU] = { 0x080000000000ull, 0x020000000000ull }, - [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull }, - [PM_IDU] = { 0x380000000000ull, 0x010000000000ull }, - [PM_STS] = { 0x380000000000ull, 0x310000000000ull }, -}; - -static int p970_get_constraint(u64 event, unsigned long *maskp, - unsigned long *valp) -{ - int pmc, byte, unit, sh, spcsel; - unsigned long mask = 0, value = 0; - int grp = -1; - - pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc > 8) - return -1; - sh = (pmc - 1) * 2; - mask |= 2 << sh; - value |= 1 << sh; - grp = ((pmc - 1) >> 1) & 1; - } - unit = (event >> PM_UNIT_SH) 
& PM_UNIT_MSK; - if (unit) { - if (unit > PM_LASTUNIT) - return -1; - mask |= unit_cons[unit][0]; - value |= unit_cons[unit][1]; - byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - /* - * Bus events on bytes 0 and 2 can be counted - * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. - */ - if (!pmc) - grp = byte & 1; - /* Set byte lane select field */ - mask |= 0xfULL << (28 - 4 * byte); - value |= (unsigned long)unit << (28 - 4 * byte); - } - if (grp == 0) { - /* increment PMC1/2/5/6 field */ - mask |= 0x8000000000ull; - value |= 0x1000000000ull; - } else if (grp == 1) { - /* increment PMC3/4/7/8 field */ - mask |= 0x800000000ull; - value |= 0x100000000ull; - } - spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK; - if (spcsel) { - mask |= 3ull << 48; - value |= (unsigned long)spcsel << 48; - } - *maskp = mask; - *valp = value; - return 0; -} - -static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[]) -{ - alt[0] = event; - - /* 2 alternatives for LSU empty */ - if (event == 0x2002 || event == 0x3002) { - alt[1] = event ^ 0x1000; - return 2; - } - - return 1; -} - -static int p970_compute_mmcr(u64 event[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]) -{ - unsigned long mmcr0 = 0, mmcr1 = 0, mmcra = 0; - unsigned int pmc, unit, byte, psel; - unsigned int ttm, grp; - unsigned int pmc_inuse = 0; - unsigned int pmc_grp_use[2]; - unsigned char busbyte[4]; - unsigned char unituse[16]; - unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 }; - unsigned char ttmuse[2]; - unsigned char pmcsel[8]; - int i; - int spcsel; - - if (n_ev > 8) - return -1; - - /* First pass to count resource use */ - pmc_grp_use[0] = pmc_grp_use[1] = 0; - memset(busbyte, 0, sizeof(busbyte)); - memset(unituse, 0, sizeof(unituse)); - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - if (pmc) { - if (pmc_inuse & (1 << (pmc - 1))) - return -1; - pmc_inuse |= 1 << (pmc - 1); - /* count 1/2/5/6 vs 3/4/7/8 use */ - ++pmc_grp_use[((pmc - 1) >> 1) & 
1]; - } - unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; - byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; - if (unit) { - if (unit > PM_LASTUNIT) - return -1; - if (!pmc) - ++pmc_grp_use[byte & 1]; - if (busbyte[byte] && busbyte[byte] != unit) - return -1; - busbyte[byte] = unit; - unituse[unit] = 1; - } - } - if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) - return -1; - - /* - * Assign resources and set multiplexer selects. - * - * PM_ISU can go either on TTM0 or TTM1, but that's the only - * choice we have to deal with. - */ - if (unituse[PM_ISU] & - (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU])) - unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */ - /* Set TTM[01]SEL fields. */ - ttmuse[0] = ttmuse[1] = 0; - for (i = PM_FPU; i <= PM_STS; ++i) { - if (!unituse[i]) - continue; - ttm = unitmap[i]; - ++ttmuse[(ttm >> 2) & 1]; - mmcr1 |= (unsigned long)(ttm & ~4) << MMCR1_TTM1SEL_SH; - } - /* Check only one unit per TTMx */ - if (ttmuse[0] > 1 || ttmuse[1] > 1) - return -1; - - /* Set byte lane select fields and TTM3SEL. 
*/ - for (byte = 0; byte < 4; ++byte) { - unit = busbyte[byte]; - if (!unit) - continue; - if (unit <= PM_STS) - ttm = (unitmap[unit] >> 2) & 1; - else if (unit == PM_LSU0) - ttm = 2; - else { - ttm = 3; - if (unit == PM_LSU1L && byte >= 2) - mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); - } - mmcr1 |= (unsigned long)ttm - << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); - } - - /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ - memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */ - for (i = 0; i < n_ev; ++i) { - pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; - unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; - byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; - psel = event[i] & PM_PMCSEL_MSK; - if (!pmc) { - /* Bus event or any-PMC direct event */ - if (unit) - psel |= 0x10 | ((byte & 2) << 2); - else - psel |= 8; - for (pmc = 0; pmc < 8; ++pmc) { - if (pmc_inuse & (1 << pmc)) - continue; - grp = (pmc >> 1) & 1; - if (unit) { - if (grp == (byte & 1)) - break; - } else if (pmc_grp_use[grp] < 4) { - ++pmc_grp_use[grp]; - break; - } - } - pmc_inuse |= 1 << pmc; - } else { - /* Direct event */ - --pmc; - if (psel == 0 && (byte & 2)) - /* add events on higher-numbered bus */ - mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; - } - pmcsel[pmc] = psel; - hwc[i] = pmc; - spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK; - mmcr1 |= spcsel; - if (p970_marked_instr_event(event[i])) - mmcra |= MMCRA_SAMPLE_ENABLE; - } - for (pmc = 0; pmc < 2; ++pmc) - mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc); - for (; pmc < 8; ++pmc) - mmcr1 |= (unsigned long)pmcsel[pmc] - << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); - if (pmc_inuse & 1) - mmcr0 |= MMCR0_PMC1CE; - if (pmc_inuse & 0xfe) - mmcr0 |= MMCR0_PMCjCE; - - mmcra |= 0x2000; /* mark only one IOP per PPC instruction */ - - /* Return MMCRx values */ - mmcr[0] = mmcr0; - mmcr[1] = mmcr1; - mmcr[2] = mmcra; - return 0; -} - -static void p970_disable_pmc(unsigned int pmc, unsigned long mmcr[]) -{ - int shift, i; - - if (pmc 
<= 1) { - shift = MMCR0_PMC1SEL_SH - 7 * pmc; - i = 0; - } else { - shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2); - i = 1; - } - /* - * Setting the PMCxSEL field to 0x08 disables PMC x. - */ - mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift); -} - -static int ppc970_generic_events[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 7, - [PERF_COUNT_HW_INSTRUCTIONS] = 1, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ - [PERF_COUNT_HW_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ - [PERF_COUNT_HW_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ -}; - -#define C(x) PERF_COUNT_HW_CACHE_##x - -/* - * Table of generalized cache-related events. - * 0 means not supported, -1 means nonsensical, other values - * are event codes. - */ -static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { - [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x8810, 0x3810 }, - [C(OP_WRITE)] = { 0x7810, 0x813 }, - [C(OP_PREFETCH)] = { 0x731, 0 }, - }, - [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { 0, 0 }, - }, - [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0 }, - [C(OP_WRITE)] = { 0, 0 }, - [C(OP_PREFETCH)] = { 0x733, 0 }, - }, - [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x704 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0, 0x700 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x431, 0x327 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, - [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { -1, -1 }, - [C(OP_WRITE)] = { -1, -1 }, - [C(OP_PREFETCH)] = { -1, -1 }, - }, -}; - -static struct power_pmu ppc970_pmu = { - .name = "PPC970/FX/MP", - .n_counter = 8, - 
.max_alternatives = 2, - .add_fields = 0x001100005555ull, - .test_adder = 0x013300000000ull, - .compute_mmcr = p970_compute_mmcr, - .get_constraint = p970_get_constraint, - .get_alternatives = p970_get_alternatives, - .disable_pmc = p970_disable_pmc, - .n_generic = ARRAY_SIZE(ppc970_generic_events), - .generic_events = ppc970_generic_events, - .cache_events = &ppc970_cache_events, -}; - -static int __init init_ppc970_pmu(void) -{ - if (!cur_cpu_spec->oprofile_cpu_type || - (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970") - && strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970MP"))) - return -ENODEV; - - return register_power_pmu(&ppc970_pmu); -} - -early_initcall(init_ppc970_pmu); -- cgit v1.2.2 From a2007ce8447c9a71d9d694ddcdf64f9dbbf022ff Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 14 Feb 2012 01:40:59 +0000 Subject: powerpc: Use set_current_blocked() and block_sigmask() As described in e6fa16ab ("signal: sigprocmask() should do retarget_shared_pending()") the modification of current->blocked is incorrect as we need to check whether the signal we're about to block is pending in the shared queue. Also, use the new helper function introduced in commit 5e6292c0f28f ("signal: add block_sigmask() for adding sigmask to current->blocked") which centralises the code for updating current->blocked after successfully delivering a signal and reduces the amount of duplicate code across architectures. In the past some architectures got this code wrong, so using this helper function should stop that from happening again. 
Cc: Oleg Nesterov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Matt Fleming Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/signal.c | 13 ++----------- arch/powerpc/kernel/signal_32.c | 11 ++++++----- 2 files changed, 8 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index ac6e437b1021..7006b7f4267a 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -57,10 +57,7 @@ void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, void restore_sigmask(sigset_t *set) { sigdelsetmask(set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = *set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); + set_current_blocked(set); } static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, @@ -169,13 +166,7 @@ static int do_signal(struct pt_regs *regs) regs->trap = 0; if (ret) { - spin_lock_irq(&current->sighand->siglock); - sigorsets(&current->blocked, &current->blocked, - &ka.sa.sa_mask); - if (!(ka.sa.sa_flags & SA_NODEFER)) - sigaddset(&current->blocked, signr); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); + block_sigmask(&ka, signr); /* * A signal was successfully delivered; the saved sigmask is in diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 836a5a19eb2c..e061ef5dd449 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -242,12 +242,13 @@ static inline int restore_general_regs(struct pt_regs *regs, */ long sys_sigsuspend(old_sigset_t mask) { - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); + sigset_t blocked; + current->saved_sigmask = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); + + mask &= _BLOCKABLE; + siginitset(&blocked, mask); + set_current_blocked(&blocked); current->state
= TASK_INTERRUPTIBLE; schedule(); -- cgit v1.2.2 From e9daf2ad7f603f173d7cd7ee3673b326414f82f4 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Mon, 27 Feb 2012 08:55:15 +0000 Subject: powerpc/prom: Remove limit on maximum size of properties On a 16TB system (using AMS/CMO), I get: WARNING: ignoring large property [/ibm,dynamic-reconfiguration-memory] ibm,dynamic-memory length 0x000000000017ffec and significantly less memory is thus shown to the partition. As far as I can tell, the constant used is arbitrary. Ben Herrenschmidt provided additional background that > The limit was originally set because of Apple machines carrying ROM > images in the device-tree, at a time where we were much more memory > constrained than we are now. and that it is likely not very useful any longer. Signed-off-by: Nishanth Aravamudan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/prom_init.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index eca626ea3f23..e2d599048142 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -47,14 +47,6 @@ #include -/* - * Properties whose value is longer than this get excluded from our - * copy of the device tree. This value does need to be big enough to - * ensure that we don't lose things like the interrupt-map property - * on a PCI-PCI bridge. 
- */ -#define MAX_PROPERTY_LENGTH (1UL * 1024 * 1024) - /* * Eventually bump that one up */ @@ -2273,13 +2265,6 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, /* sanity checks */ if (l == PROM_ERROR) continue; - if (l > MAX_PROPERTY_LENGTH) { - prom_printf("WARNING: ignoring large property "); - /* It seems OF doesn't null-terminate the path :-( */ - prom_printf("[%s] ", path); - prom_printf("%s length 0x%x\n", RELOC(pname), l); - continue; - } /* push property head */ dt_push_token(OF_DT_PROP, mem_start, mem_end); -- cgit v1.2.2 From ad5b7f1350c263eef0c99c20f8659d0ed363cb32 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 30 Jan 2012 08:02:19 +0000 Subject: powerpc: Make SPARSE_IRQ required All IRQs on powerpc are managed via irq_domain anyway, there isn't really any advantage to turning SPARSE_IRQ off, and it's the direction we want to take the kernel design anyway. This patch makes powerpc always use SPARSE_IRQ. On pseries_defconfig, SPARSE_IRQ adds only about 0x300 bytes to the .text sections, and removes about 0x20000 from the data section for the static irq_desc table. 
Signed-off-by: Grant Likely Cc: Rob Herring Cc: Ben Herrenschmidt Cc: Thomas Gleixner Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/irq.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 01e2877e8e04..9b6e80668cfb 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -93,10 +93,6 @@ extern int tau_interrupts(int); #ifdef CONFIG_PPC64 -#ifndef CONFIG_SPARSE_IRQ -EXPORT_SYMBOL(irq_desc); -#endif - int distribute_irqs = 1; static inline notrace unsigned long get_hard_enabled(void) -- cgit v1.2.2 From b0787660260604ba63621881851de0032279819b Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 7 Mar 2012 18:43:10 +0000 Subject: powerpc: clean up vio.c This cleans up vio.c after the removal of the legacy iSeries platform. It also removes some no longer referenced include files. Signed-off-by: Stephen Rothwell Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/vio.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 8b086299ba25..bca3fc427b45 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -34,11 +34,6 @@ #include #include #include -#include -#include -#include -#include -#include static struct bus_type vio_bus_type; @@ -1042,7 +1037,6 @@ static void vio_cmo_sysfs_init(void) vio_bus_type.bus_attrs = vio_cmo_bus_attrs; } #else /* CONFIG_PPC_SMLPAR */ -/* Dummy functions for iSeries platform */ int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; } void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {} static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; } @@ -1060,9 +1054,6 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) struct iommu_table *tbl; unsigned long offset, size; - if 
(firmware_has_feature(FW_FEATURE_ISERIES)) - return vio_build_iommu_table_iseries(dev); - dma_window = of_get_property(dev->dev.of_node, "ibm,my-dma-window", NULL); if (!dma_window) @@ -1195,8 +1186,7 @@ static void __devinit vio_dev_release(struct device *dev) { struct iommu_table *tbl = get_iommu_table_base(dev); - /* iSeries uses a common table for all vio devices */ - if (!firmware_has_feature(FW_FEATURE_ISERIES) && tbl) + if (tbl) iommu_free_table(tbl, dev->of_node ? dev->of_node->full_name : dev_name(dev)); of_node_put(dev->of_node); @@ -1244,12 +1234,6 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) viodev->name = of_node->name; viodev->type = of_node->type; viodev->unit_address = *unit_address; - if (firmware_has_feature(FW_FEATURE_ISERIES)) { - unit_address = of_get_property(of_node, - "linux,unit_address", NULL); - if (unit_address != NULL) - viodev->unit_address = *unit_address; - } viodev->dev.of_node = of_node_get(of_node); if (firmware_has_feature(FW_FEATURE_CMO)) -- cgit v1.2.2 From 4f8cf36f48b4648a5231e9fc8e49faea377246f4 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 28 Feb 2012 13:44:58 +1100 Subject: powerpc: Remove legacy iSeries bits from assembly files This removes the various bits of assembly in the kernel entry, exception handling and SLB management code that were specific to running under the legacy iSeries hypervisor which is no longer supported. 
Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/entry_64.S | 42 +--------------- arch/powerpc/kernel/exceptions-64s.S | 95 +++--------------------------------- arch/powerpc/kernel/head_64.S | 44 +++-------------- arch/powerpc/kernel/misc.S | 1 - arch/powerpc/kernel/vmlinux.lds.S | 5 -- 5 files changed, 15 insertions(+), 172 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 866462cbe2d8..0c3764ba8d49 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -127,17 +127,6 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) stb r10,PACASOFTIRQEN(r13) stb r10,PACAHARDIRQEN(r13) std r10,SOFTE(r1) -#ifdef CONFIG_PPC_ISERIES -BEGIN_FW_FTR_SECTION - /* Hack for handling interrupts when soft-enabling on iSeries */ - cmpdi cr1,r0,0x5555 /* syscall 0x5555 */ - andi. r10,r12,MSR_PR /* from kernel */ - crand 4*cr0+eq,4*cr1+eq,4*cr0+eq - bne 2f - b hardware_interrupt_entry -2: -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) -#endif /* CONFIG_PPC_ISERIES */ /* Hard enable interrupts */ #ifdef CONFIG_PPC_BOOK3E @@ -591,15 +580,10 @@ _GLOBAL(ret_from_except_lite) ld r4,TI_FLAGS(r9) andi. 
r0,r4,_TIF_USER_WORK_MASK bne do_work -#endif +#endif /* !CONFIG_PREEMPT */ restore: -BEGIN_FW_FTR_SECTION ld r5,SOFTE(r1) -FW_FTR_SECTION_ELSE - b .Liseries_check_pending_irqs -ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) -2: TRACE_AND_RESTORE_IRQ(r5); /* extract EE bit and use it to restore paca->hard_enabled */ @@ -669,30 +653,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) #endif /* CONFIG_PPC_BOOK3E */ -.Liseries_check_pending_irqs: -#ifdef CONFIG_PPC_ISERIES - ld r5,SOFTE(r1) - cmpdi 0,r5,0 - beq 2b - /* Check for pending interrupts (iSeries) */ - ld r3,PACALPPACAPTR(r13) - ld r3,LPPACAANYINT(r3) - cmpdi r3,0 - beq+ 2b /* skip do_IRQ if no interrupts */ - - li r3,0 - stb r3,PACASOFTIRQEN(r13) /* ensure we are soft-disabled */ -#ifdef CONFIG_TRACE_IRQFLAGS - bl .trace_hardirqs_off - mfmsr r10 -#endif - ori r10,r10,MSR_EE - mtmsrd r10 /* hard-enable again */ - addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_IRQ - b .ret_from_except_lite /* loop back and handle more */ -#endif - do_work: #ifdef CONFIG_PREEMPT andi. r0,r3,MSR_PR /* Returning to user mode? 
*/ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 15c5a4f6de01..fea8a69df4b2 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -19,7 +19,7 @@ * We layout physical memory as follows: * 0x0000 - 0x00ff : Secondary processor spin code * 0x0100 - 0x2fff : pSeries Interrupt prologs - * 0x3000 - 0x5fff : interrupt support, iSeries and common interrupt prologs + * 0x3000 - 0x5fff : interrupt support common interrupt prologs * 0x6000 - 0x6fff : Initial (CPU0) segment table * 0x7000 - 0x7fff : FWNMI data area * 0x8000 - : Early init and support code @@ -458,6 +458,7 @@ machine_check_common: bl .machine_check_exception b .ret_from_except + STD_EXCEPTION_COMMON_LITE(0x500, hardware_interrupt, do_IRQ) STD_EXCEPTION_COMMON_LITE(0x900, decrementer, .timer_interrupt) STD_EXCEPTION_COMMON(0xa00, trap_0a, .unknown_exception) STD_EXCEPTION_COMMON(0xb00, trap_0b, .unknown_exception) @@ -672,12 +673,6 @@ _GLOBAL(slb_miss_realmode) ld r10,PACA_EXSLB+EX_LR(r13) ld r3,PACA_EXSLB+EX_R3(r13) lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ -#ifdef CONFIG_PPC_ISERIES -BEGIN_FW_FTR_SECTION - ld r11,PACALPPACAPTR(r13) - ld r11,LPPACASRR0(r11) /* get SRR0 value */ -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) -#endif /* CONFIG_PPC_ISERIES */ mtlr r10 @@ -690,12 +685,6 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ .machine pop -#ifdef CONFIG_PPC_ISERIES -BEGIN_FW_FTR_SECTION - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r12 -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) -#endif /* CONFIG_PPC_ISERIES */ ld r9,PACA_EXSLB+EX_R9(r13) ld r10,PACA_EXSLB+EX_R10(r13) ld r11,PACA_EXSLB+EX_R11(r13) @@ -704,13 +693,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) rfid b . 
/* prevent speculative execution */ -2: -#ifdef CONFIG_PPC_ISERIES -BEGIN_FW_FTR_SECTION - b unrecov_slb -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) -#endif /* CONFIG_PPC_ISERIES */ - mfspr r11,SPRN_SRR0 +2: mfspr r11,SPRN_SRR0 ld r10,PACAKBASE(r13) LOAD_HANDLER(r10,unrecov_slb) mtspr SPRN_SRR0,r10 @@ -727,20 +710,6 @@ unrecov_slb: bl .unrecoverable_exception b 1b - .align 7 - .globl hardware_interrupt_common - .globl hardware_interrupt_entry -hardware_interrupt_common: - EXCEPTION_PROLOG_COMMON(0x500, PACA_EXGEN) - FINISH_NAP -hardware_interrupt_entry: - DISABLE_INTS -BEGIN_FTR_SECTION - bl .ppc64_runlatch_on -END_FTR_SECTION_IFSET(CPU_FTR_CTRL) - addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_IRQ - b .ret_from_except_lite #ifdef CONFIG_PPC_970_NAP power4_fixup_nap: @@ -913,11 +882,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ bne 77f /* then don't call hash_page now */ - /* - * On iSeries, we soft-disable interrupts here, then - * hard-enable interrupts so that the hash_page code can spin on - * the hash_table_lock without problems on a shared processor. - */ + /* We run with interrupts both soft and hard disabled */ DISABLE_INTS /* @@ -956,25 +921,11 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) bl .hash_page /* build HPTE if possible */ cmpdi r3,0 /* see if hash_page succeeded */ -BEGIN_FW_FTR_SECTION - /* - * If we had interrupts soft-enabled at the point where the - * DSI/ISI occurred, and an interrupt came in during hash_page, - * handle it now. - * We jump to ret_from_except_lite rather than fast_exception_return - * because ret_from_except_lite will check for and handle pending - * interrupts if necessary. - */ - beq 13f -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) - -BEGIN_FW_FTR_SECTION /* * Here we have interrupts hard-disabled, so it is sufficient * to restore paca->{soft,hard}_enable and get out. 
*/ beq fast_exc_return_irq /* Return from exception on success */ -END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES) /* For a hash failure, we don't bother re-enabling interrupts */ ble- 12f @@ -1141,51 +1092,19 @@ _GLOBAL(do_stab_bolted) .= 0x7000 .globl fwnmi_data_area fwnmi_data_area: -#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ - - /* iSeries does not use the FWNMI stuff, so it is safe to put - * this here, even if we later allow kernels that will boot on - * both pSeries and iSeries */ -#ifdef CONFIG_PPC_ISERIES - . = LPARMAP_PHYS - .globl xLparMap -xLparMap: - .quad HvEsidsToMap /* xNumberEsids */ - .quad HvRangesToMap /* xNumberRanges */ - .quad STAB0_PAGE /* xSegmentTableOffs */ - .zero 40 /* xRsvd */ - /* xEsids (HvEsidsToMap entries of 2 quads) */ - .quad PAGE_OFFSET_ESID /* xKernelEsid */ - .quad PAGE_OFFSET_VSID /* xKernelVsid */ - .quad VMALLOC_START_ESID /* xKernelEsid */ - .quad VMALLOC_START_VSID /* xKernelVsid */ - /* xRanges (HvRangesToMap entries of 3 quads) */ - .quad HvPagesToMap /* xPages */ - .quad 0 /* xOffset */ - .quad PAGE_OFFSET_VSID << (SID_SHIFT - HW_PAGE_SHIFT) /* xVPN */ - -#endif /* CONFIG_PPC_ISERIES */ -#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* pseries and powernv need to keep the whole page from * 0x7000 to 0x8000 free for use by the firmware */ . = 0x8000 #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ -/* - * Space for CPU0's segment table. - * - * On iSeries, the hypervisor must fill in at least one entry before - * we get control (with relocate on). The address is given to the hv - * as a page number (see xLparMap above), so this must be at a - * fixed address (the linker can't compute (u64)&initial_stab >> - * PAGE_SHIFT). - */ - . 
= STAB0_OFFSET /* 0x8000 */ +/* Space for CPU0's segment table */ + .balign 4096 .globl initial_stab initial_stab: .space 4096 + #ifdef CONFIG_PPC_POWERNV _GLOBAL(opal_mc_secondary_handler) HMT_MEDIUM diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 06c7251c1bf7..40759fbfb171 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -57,10 +56,6 @@ * entry in r9 for debugging purposes * 2. Secondary processors enter at 0x60 with PIR in gpr3 * - * For iSeries: - * 1. The MMU is on (as it always is for iSeries) - * 2. The kernel is entered at system_reset_iSeries - * * For Book3E processors: * 1. The MMU is on running in AS0 in a state defined in ePAPR * 2. The kernel is entered at __start @@ -93,15 +88,6 @@ __secondary_hold_spinloop: __secondary_hold_acknowledge: .llong 0x0 -#ifdef CONFIG_PPC_ISERIES - /* - * At offset 0x20, there is a pointer to iSeries LPAR data. - * This is required by the hypervisor - */ - . = 0x20 - .llong hvReleaseData-KERNELBASE -#endif /* CONFIG_PPC_ISERIES */ - #ifdef CONFIG_RELOCATABLE /* This flag is set to 1 by a loader if the kernel should run * at the loaded address instead of the linked address. This @@ -582,7 +568,7 @@ _GLOBAL(pmac_secondary_start) * 1. Processor number * 2. Segment table pointer (virtual address) * On entry the following are set: - * r1 = stack pointer. vaddr for iSeries, raddr (temp stack) for pSeries + * r1 = stack pointer (real addr of temp stack) * r24 = cpu# (in Linux terms) * r13 = paca virtual address * SPRG_PACA = paca virtual address @@ -595,7 +581,7 @@ __secondary_start: /* Set thread priority to MEDIUM */ HMT_MEDIUM - /* Initialize the kernel stack. Just a repeat for iSeries. 
*/ + /* Initialize the kernel stack */ LOAD_REG_ADDR(r3, current_set) sldi r28,r24,3 /* get current_set[cpu#] */ ldx r14,r3,r28 @@ -615,20 +601,13 @@ __secondary_start: li r7,0 mtlr r7 + /* Mark interrupts both hard and soft disabled */ + stb r7,PACAHARDIRQEN(r13) + stb r7,PACASOFTIRQEN(r13) + /* enable MMU and jump to start_secondary */ LOAD_REG_ADDR(r3, .start_secondary_prolog) LOAD_REG_IMMEDIATE(r4, MSR_KERNEL) -#ifdef CONFIG_PPC_ISERIES -BEGIN_FW_FTR_SECTION - ori r4,r4,MSR_EE - li r8,1 - stb r8,PACAHARDIRQEN(r13) -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) -#endif -BEGIN_FW_FTR_SECTION - stb r7,PACAHARDIRQEN(r13) -END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES) - stb r7,PACASOFTIRQEN(r13) mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 @@ -774,17 +753,8 @@ _INIT_GLOBAL(start_here_common) bl .setup_system /* Load up the kernel context */ -5: - li r5,0 +5: li r5,0 stb r5,PACASOFTIRQEN(r13) /* Soft Disabled */ -#ifdef CONFIG_PPC_ISERIES -BEGIN_FW_FTR_SECTION - mfmsr r5 - ori r5,r5,MSR_EE /* Hard Enabled on iSeries*/ - mtmsrd r5 - li r5,1 -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) -#endif stb r5,PACAHARDIRQEN(r13) /* Hard Disabled on others */ bl .start_kernel diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index b69463ec2010..ba16874fe294 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -5,7 +5,6 @@ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu) * and Paul Mackerras. * - * Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com) * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com) * * setjmp/longjmp code by Paul Mackerras. 
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 710a54005dfb..65d1c08cf09e 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -109,11 +109,6 @@ SECTIONS __ptov_table_begin = .; *(.ptov_fixup); __ptov_table_end = .; -#ifdef CONFIG_PPC_ISERIES - __dt_strings_start = .; - *(.dt_strings); - __dt_strings_end = .; -#endif } .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { -- cgit v1.2.2 From 7450f6f03e9d6dc95d2014c4cceac8adf98560e8 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 1 Mar 2012 10:52:01 +1100 Subject: powerpc: Use the same interrupt prolog for perfmon as other interrupts The perfmon interrupt is the sole user of a special variant of the interrupt prolog which differs from the one used by external and timer interrupts in that it saves the non-volatile GPRs and doesn't turn the runlatch on. The former is unnecessary and the latter is arguably incorrect, so let's clean that up by using the same prolog. While at it we rename that prolog to use the _ASYNC prefix.
Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/exceptions-64s.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index fea8a69df4b2..2240d4ecec02 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -458,15 +458,15 @@ machine_check_common: bl .machine_check_exception b .ret_from_except - STD_EXCEPTION_COMMON_LITE(0x500, hardware_interrupt, do_IRQ) - STD_EXCEPTION_COMMON_LITE(0x900, decrementer, .timer_interrupt) + STD_EXCEPTION_COMMON_ASYNC(0x500, hardware_interrupt, do_IRQ) + STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, .timer_interrupt) STD_EXCEPTION_COMMON(0xa00, trap_0a, .unknown_exception) STD_EXCEPTION_COMMON(0xb00, trap_0b, .unknown_exception) STD_EXCEPTION_COMMON(0xd00, single_step, .single_step_exception) STD_EXCEPTION_COMMON(0xe00, trap_0e, .unknown_exception) STD_EXCEPTION_COMMON(0xe40, emulation_assist, .program_check_exception) STD_EXCEPTION_COMMON(0xe60, hmi_exception, .unknown_exception) - STD_EXCEPTION_COMMON_IDLE(0xf00, performance_monitor, .performance_monitor_exception) + STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, .performance_monitor_exception) STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, .instruction_breakpoint_exception) #ifdef CONFIG_ALTIVEC STD_EXCEPTION_COMMON(0x1700, altivec_assist, .altivec_assist_exception) -- cgit v1.2.2 From fe1952fc0afb9a2e4c79f103c08aef5d13db1873 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 1 Mar 2012 12:45:27 +1100 Subject: powerpc: Rework runlatch code This moves the inlines into system.h and changes the runlatch code to use the thread local flags (non-atomic) rather than the TIF flags (atomic) to keep track of the latch state. The code to turn it back on in an asynchronous interrupt is now simplified and partially inlined. 
Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/exceptions-64s.S | 3 +++ arch/powerpc/kernel/process.c | 24 +++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2240d4ecec02..3af80e82830b 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -483,6 +483,9 @@ machine_check_common: system_call_entry: b system_call_common +ppc64_runlatch_on_trampoline: + b .__ppc64_runlatch_on + /* * Here we have detected that the kernel stack pointer is bad. * R9 contains the saved CR, r13 points to the paca, diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index d817ab018486..bf80a1d5f8fe 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1220,34 +1220,32 @@ void dump_stack(void) EXPORT_SYMBOL(dump_stack); #ifdef CONFIG_PPC64 -void ppc64_runlatch_on(void) +/* Called with hard IRQs off */ +void __ppc64_runlatch_on(void) { + struct thread_info *ti = current_thread_info(); unsigned long ctrl; - if (cpu_has_feature(CPU_FTR_CTRL) && !test_thread_flag(TIF_RUNLATCH)) { - HMT_medium(); - - ctrl = mfspr(SPRN_CTRLF); - ctrl |= CTRL_RUNLATCH; - mtspr(SPRN_CTRLT, ctrl); + ctrl = mfspr(SPRN_CTRLF); + ctrl |= CTRL_RUNLATCH; + mtspr(SPRN_CTRLT, ctrl); - set_thread_flag(TIF_RUNLATCH); - } + ti->local_flags |= TLF_RUNLATCH; } +/* Called with hard IRQs off */ void __ppc64_runlatch_off(void) { + struct thread_info *ti = current_thread_info(); unsigned long ctrl; - HMT_medium(); - - clear_thread_flag(TIF_RUNLATCH); + ti->local_flags &= ~TLF_RUNLATCH; ctrl = mfspr(SPRN_CTRLF); ctrl &= ~CTRL_RUNLATCH; mtspr(SPRN_CTRLT, ctrl); } -#endif +#endif /* CONFIG_PPC64 */ #if THREAD_SHIFT < PAGE_SHIFT -- cgit v1.2.2 From 1421ae0b29e0003395613bf67610d15fb7047e09 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 1 Mar 2012 15:40:23 +1100 
Subject: powerpc: Improve 64-bit syscall entry/exit We unconditionally hard enable interrupts. This is unnecessary as syscalls are expected to always be called with interrupts enabled. While at it, we add a WARN_ON if that is not the case and CONFIG_TRACE_IRQFLAGS is enabled (we don't want to add overhead to the fast path when this is not set though). Thus let's remove the enabling (and associated irq tracing) from the syscall entry path. Also on Book3S, replace a few mfmsr instructions with loads of PACAMSR from the PACA, which should be faster & schedule better. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/entry_64.S | 43 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 0c3764ba8d49..cc030b73174b 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -115,28 +115,33 @@ BEGIN_FW_FTR_SECTION END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) #endif /* CONFIG_VIRT_CPU_ACCOUNTING && CONFIG_PPC_SPLPAR */ -#ifdef CONFIG_TRACE_IRQFLAGS - bl .trace_hardirqs_on - REST_GPR(0,r1) - REST_4GPRS(3,r1) - REST_2GPRS(7,r1) - addi r9,r1,STACK_FRAME_OVERHEAD - ld r12,_MSR(r1) -#endif /* CONFIG_TRACE_IRQFLAGS */ - li r10,1 - stb r10,PACASOFTIRQEN(r13) - stb r10,PACAHARDIRQEN(r13) - std r10,SOFTE(r1) + /* + * A syscall should always be called with interrupts enabled + * so we just unconditionally hard-enable here. 
When some kind + * of irq tracing is used, we additionally check that condition + * is correct + */ +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_BUG) + lbz r10,PACASOFTIRQEN(r13) + xori r10,r10,1 +1: tdnei r10,0 + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING +#endif - /* Hard enable interrupts */ #ifdef CONFIG_PPC_BOOK3E wrteei 1 #else - mfmsr r11 + ld r11,PACAKMSR(r13) ori r11,r11,MSR_EE mtmsrd r11,1 #endif /* CONFIG_PPC_BOOK3E */ + /* We do need to set SOFTE in the stack frame or the return + * from interrupt will be painful + */ + li r10,1 + std r10,SOFTE(r1) + #ifdef SHOW_SYSCALLS bl .do_show_syscall REST_GPR(0,r1) @@ -187,16 +192,14 @@ syscall_exit: andi. r10,r8,MSR_RI beq- unrecov_restore #endif - - /* Disable interrupts so current_thread_info()->flags can't change, + /* + * Disable interrupts so current_thread_info()->flags can't change, * and so that we don't get interrupted after loading SRR0/1. */ #ifdef CONFIG_PPC_BOOK3E wrteei 0 #else - mfmsr r10 - rldicl r10,r10,48,1 - rotldi r10,r10,16 + ld r10,PACAKMSR(r13) mtmsrd r10,1 #endif /* CONFIG_PPC_BOOK3E */ @@ -308,7 +311,7 @@ syscall_exit_work: #ifdef CONFIG_PPC_BOOK3E wrteei 1 #else - mfmsr r10 + ld r10,PACAKMSR(r13) ori r10,r10,MSR_EE mtmsrd r10,1 #endif /* CONFIG_PPC_BOOK3E */ -- cgit v1.2.2 From a546498f3bf9aac311c66f965186373aee2ca0b0 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 7 Mar 2012 16:48:45 +1100 Subject: powerpc: Call do_page_fault() with interrupts off We currently turn interrupts back to their previous state before calling do_page_fault(). This can be annoying when debugging as a bad fault will potentially have lost some processor state before getting into the debugger. We also end up calling some generic code with interrupts enabled such as notify_page_fault() with interrupts enabled, which could be unexpected. 
This changes our code to behave more like other architectures, and make the assembly entry code call into do_page_faults() with interrupts disabled. They are conditionally re-enabled from within do_page_fault() in the same spot x86 does it. While there, add the might_sleep() test in the case of a successful trylock of the mmap semaphore, again like x86. Also fix a bug in the existing assembly where r12 (_MSR) could get clobbered by C calls (the DTL accounting in the exception common macro and DISABLE_INTS) in some cases. Signed-off-by: Benjamin Herrenschmidt --- v2. Add the r12 clobber fix --- arch/powerpc/kernel/exceptions-64e.S | 5 ++- arch/powerpc/kernel/exceptions-64s.S | 59 +++++++++++------------------------- arch/powerpc/kernel/head_32.S | 4 +-- arch/powerpc/kernel/head_40x.S | 4 +-- arch/powerpc/kernel/head_8xx.S | 4 +-- arch/powerpc/kernel/head_booke.h | 4 +-- arch/powerpc/kernel/head_fsl_booke.S | 2 +- 7 files changed, 28 insertions(+), 54 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 429983c06f91..573613d747ac 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -313,7 +313,7 @@ interrupt_end_book3e: NORMAL_EXCEPTION_PROLOG(0x300, PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - EXCEPTION_COMMON(0x300, PACA_EXGEN, INTS_KEEP) + EXCEPTION_COMMON(0x300, PACA_EXGEN, INTS_DISABLE_ALL) b storage_fault_common /* Instruction Storage Interrupt */ @@ -321,7 +321,7 @@ interrupt_end_book3e: NORMAL_EXCEPTION_PROLOG(0x400, PROLOG_ADDITION_2REGS) li r15,0 mr r14,r10 - EXCEPTION_COMMON(0x400, PACA_EXGEN, INTS_KEEP) + EXCEPTION_COMMON(0x400, PACA_EXGEN, INTS_DISABLE_ALL) b storage_fault_common /* External Input Interrupt */ @@ -591,7 +591,6 @@ storage_fault_common: mr r5,r15 ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) - INTS_RESTORE_HARD bl .do_page_fault cmpdi r3,0 bne- 1f diff --git 
a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 3af80e82830b..d8ff6d37fc4d 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -559,6 +559,8 @@ data_access_common: mfspr r10,SPRN_DSISR stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN) + DISABLE_INTS + ld r12,_MSR(r1) ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) li r5,0x300 @@ -573,6 +575,7 @@ h_data_storage_common: stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN) bl .save_nvgprs + DISABLE_INTS addi r3,r1,STACK_FRAME_OVERHEAD bl .unknown_exception b .ret_from_except @@ -581,6 +584,8 @@ h_data_storage_common: .globl instruction_access_common instruction_access_common: EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN) + DISABLE_INTS + ld r12,_MSR(r1) ld r3,_NIP(r1) andis. r4,r12,0x5820 li r5,0x400 @@ -884,24 +889,6 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ bne 77f /* then don't call hash_page now */ - - /* We run with interrupts both soft and hard disabled */ - DISABLE_INTS - - /* - * Currently, trace_hardirqs_off() will be called by DISABLE_INTS - * and will clobber volatile registers when irq tracing is enabled - * so we need to reload them. It may be possible to be smarter here - * and move the irq tracing elsewhere but let's keep it simple for - * now - */ -#ifdef CONFIG_TRACE_IRQFLAGS - ld r3,_DAR(r1) - ld r4,_DSISR(r1) - ld r5,_TRAP(r1) - ld r12,_MSR(r1) - clrrdi r5,r5,4 -#endif /* CONFIG_TRACE_IRQFLAGS */ /* * We need to set the _PAGE_USER bit if MSR_PR is set or if we are * accessing a userspace segment (even from the kernel). 
We assume @@ -931,36 +918,16 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) beq fast_exc_return_irq /* Return from exception on success */ /* For a hash failure, we don't bother re-enabling interrupts */ - ble- 12f - - /* - * hash_page couldn't handle it, set soft interrupt enable back - * to what it was before the trap. Note that .arch_local_irq_restore - * handles any interrupts pending at this point. - */ - ld r3,SOFTE(r1) - TRACE_AND_RESTORE_IRQ_PARTIAL(r3, 11f) - bl .arch_local_irq_restore - b 11f - -/* We have a data breakpoint exception - handle it */ -handle_dabr_fault: - bl .save_nvgprs - ld r4,_DAR(r1) - ld r5,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_dabr - b .ret_from_except_lite + ble- 13f /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: - ENABLE_INTS 11: ld r4,_DAR(r1) ld r5,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl .do_page_fault cmpdi r3,0 - beq+ 13f + beq+ 12f bl .save_nvgprs mr r5,r3 addi r3,r1,STACK_FRAME_OVERHEAD @@ -968,12 +935,20 @@ handle_page_fault: bl .bad_page_fault b .ret_from_except -13: b .ret_from_except_lite +/* We have a data breakpoint exception - handle it */ +handle_dabr_fault: + bl .save_nvgprs + ld r4,_DAR(r1) + ld r5,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl .do_dabr +12: b .ret_from_except_lite + /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ -12: bl .save_nvgprs +13: bl .save_nvgprs mr r5,r3 addi r3,r1,STACK_FRAME_OVERHEAD ld r4,_DAR(r1) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 0654dba2c1f1..dc0488b6f6e1 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -395,7 +395,7 @@ DataAccess: bl hash_page 1: lwz r5,_DSISR(r11) /* get DSISR value */ mfspr r4,SPRN_DAR - EXC_XFER_EE_LITE(0x300, handle_page_fault) + EXC_XFER_LITE(0x300, handle_page_fault) /* Instruction access exception. 
*/ @@ -410,7 +410,7 @@ InstructionAccess: bl hash_page 1: mr r4,r12 mr r5,r9 - EXC_XFER_EE_LITE(0x400, handle_page_fault) + EXC_XFER_LITE(0x400, handle_page_fault) /* External interrupt */ EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 872a6af83bad..4989661b710b 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -394,7 +394,7 @@ label: NORMAL_EXCEPTION_PROLOG mr r4,r12 /* Pass SRR0 as arg2 */ li r5,0 /* Pass zero as arg3 */ - EXC_XFER_EE_LITE(0x400, handle_page_fault) + EXC_XFER_LITE(0x400, handle_page_fault) /* 0x0500 - External Interrupt Exception */ EXCEPTION(0x0500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) @@ -747,7 +747,7 @@ DataAccess: mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ stw r5,_ESR(r11) mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ - EXC_XFER_EE_LITE(0x300, handle_page_fault) + EXC_XFER_LITE(0x300, handle_page_fault) /* Other PowerPC processors, namely those derived from the 6xx-series * have vectors from 0x2100 through 0x2F00 defined, but marked as reserved. diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index b68cb173ba2c..b2a5860accfb 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -220,7 +220,7 @@ DataAccess: mfspr r4,SPRN_DAR li r10,0x00f0 mtspr SPRN_DAR,r10 /* Tag DAR, to be used in DTLB Error */ - EXC_XFER_EE_LITE(0x300, handle_page_fault) + EXC_XFER_LITE(0x300, handle_page_fault) /* Instruction access exception. * This is "never generated" by the MPC8xx. 
We jump to it for other @@ -231,7 +231,7 @@ InstructionAccess: EXCEPTION_PROLOG mr r4,r12 mr r5,r9 - EXC_XFER_EE_LITE(0x400, handle_page_fault) + EXC_XFER_LITE(0x400, handle_page_fault) /* External interrupt */ EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index fc921bf62e15..0e4175388f47 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -359,7 +359,7 @@ label: mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ stw r5,_ESR(r11); \ mfspr r4,SPRN_DEAR; /* Grab the DEAR */ \ - EXC_XFER_EE_LITE(0x0300, handle_page_fault) + EXC_XFER_LITE(0x0300, handle_page_fault) #define INSTRUCTION_STORAGE_EXCEPTION \ START_EXCEPTION(InstructionStorage) \ @@ -368,7 +368,7 @@ label: stw r5,_ESR(r11); \ mr r4,r12; /* Pass SRR0 as arg2 */ \ li r5,0; /* Pass zero as arg3 */ \ - EXC_XFER_EE_LITE(0x0400, handle_page_fault) + EXC_XFER_LITE(0x0400, handle_page_fault) #define ALIGNMENT_EXCEPTION \ START_EXCEPTION(Alignment) \ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index d5d78c4ceef6..28e62598d0e8 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -319,7 +319,7 @@ interrupt_base: mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ andis. r10,r5,(ESR_ILK|ESR_DLK)@h bne 1f - EXC_XFER_EE_LITE(0x0300, handle_page_fault) + EXC_XFER_LITE(0x0300, handle_page_fault) 1: addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_EE_LITE(0x0300, CacheLockingException) -- cgit v1.2.2 From 9f2f79e3a3c19ae745d0439d6e0eed31df28de3c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 1 Mar 2012 15:47:44 +1100 Subject: powerpc: Disable interrupts in 64-bit kernel FP and vector faults If we get a floating point, altivec or vsx unavaible interrupt in kernel, we trigger a kernel error. 
There is no point preserving the interrupt state, in fact, that can even make debugging harder as the processor state might change (we may even preempt) between taking the exception and landing in a debugger. So just make those 3 disable interrupts unconditionally. Signed-off-by: Benjamin Herrenschmidt --- v2: On BookE only disable when hitting the kernel unavailable path, otherwise it will fail to restore softe as fast_exception_return doesn't do it. --- arch/powerpc/kernel/exceptions-64e.S | 7 +++---- arch/powerpc/kernel/exceptions-64s.S | 6 +++--- arch/powerpc/kernel/traps.c | 3 +++ 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 573613d747ac..3de9993c5c65 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -354,9 +354,9 @@ interrupt_end_book3e: /* we can probably do a shorter exception entry for that one... */ EXCEPTION_COMMON(0x800, PACA_EXGEN, INTS_KEEP) bne 1f /* if from user, just load it up */ + INTS_DISABLE_ALL bl .save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD - INTS_RESTORE_HARD bl .kernel_fp_unavailable_exception BUG_OPCODE 1: ld r12,_MSR(r1) @@ -391,10 +391,9 @@ interrupt_end_book3e: /* Auxiliary Processor Unavailable Interrupt */ START_EXCEPTION(ap_unavailable); NORMAL_EXCEPTION_PROLOG(0xf20, PROLOG_ADDITION_NONE) - EXCEPTION_COMMON(0xf20, PACA_EXGEN, INTS_KEEP) - addi r3,r1,STACK_FRAME_OVERHEAD + EXCEPTION_COMMON(0xf20, PACA_EXGEN, INTS_DISABLE_ALL) bl .save_nvgprs - INTS_RESTORE_HARD + addi r3,r1,STACK_FRAME_OVERHEAD bl .unknown_exception b .ret_from_except diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d8ff6d37fc4d..0fb42ae21694 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -762,8 +762,8 @@ fp_unavailable_common: EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN) bne 1f /* if from user, 
just load it up */ bl .save_nvgprs + DISABLE_INTS addi r3,r1,STACK_FRAME_OVERHEAD - ENABLE_INTS bl .kernel_fp_unavailable_exception BUG_OPCODE 1: bl .load_up_fpu @@ -782,8 +782,8 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif bl .save_nvgprs + DISABLE_INTS addi r3,r1,STACK_FRAME_OVERHEAD - ENABLE_INTS bl .altivec_unavailable_exception b .ret_from_except @@ -798,8 +798,8 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif bl .save_nvgprs + DISABLE_INTS addi r3,r1,STACK_FRAME_OVERHEAD - ENABLE_INTS bl .vsx_unavailable_exception b .ret_from_except diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5d40e592ffcb..a750409ccc4e 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -247,6 +247,9 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) addr, regs->nip, regs->link, code); } + if (!arch_irq_disabled_regs(regs)) + local_irq_enable(); + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; -- cgit v1.2.2 From 9424fabf8617c15e18a5ffd29bc3bcfa36620473 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 5 Mar 2012 10:55:04 +1100 Subject: powerpc: Fix 64-bit BookE FP unavailable exceptions We were using CR0.EQ after EXCEPTION_COMMON, hoping it still contained whether we came from userspace or kernel space. However, under some circumstances, EXCEPTION_COMMON will call C code and clobber non-volatile registers, so we really need to re-load the previous MSR from the stackframe and re-test. While there, invert the condition to make the fast path more obvious and remove the BUG_OPCODE which was a debugging leftover and call .ret_from_except as we should. 
Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/exceptions-64e.S | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 3de9993c5c65..c4c34665c221 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -353,15 +353,16 @@ interrupt_end_book3e: NORMAL_EXCEPTION_PROLOG(0x800, PROLOG_ADDITION_NONE) /* we can probably do a shorter exception entry for that one... */ EXCEPTION_COMMON(0x800, PACA_EXGEN, INTS_KEEP) - bne 1f /* if from user, just load it up */ - INTS_DISABLE_ALL + ld r12,_MSR(r1) + andi. r0,r12,MSR_PR; + beq- 1f + bl .load_up_fpu + b fast_exception_return +1: INTS_DISABLE_ALL bl .save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl .kernel_fp_unavailable_exception - BUG_OPCODE -1: ld r12,_MSR(r1) - bl .load_up_fpu - b fast_exception_return + b .ret_from_except /* Decrementer Interrupt */ MASKABLE_EXCEPTION(0x900, decrementer, .timer_interrupt, ACK_DEC) -- cgit v1.2.2 From d9ada91ae2969ae6b6dc3574fd08a6ebda5df766 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 2 Mar 2012 11:33:52 +1100 Subject: powerpc: Replace mfmsr instructions with load from PACA kernel_msr field On 64-bit, the mfmsr instruction can be quite slow, slower than loading a field from the cache-hot PACA, which happens to already contain the value we want in most cases. 
Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/entry_64.S | 14 +++++--------- arch/powerpc/kernel/exceptions-64s.S | 5 ++--- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index cc030b73174b..c513beb78b3b 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -557,10 +557,8 @@ _GLOBAL(ret_from_except_lite) #ifdef CONFIG_PPC_BOOK3E wrteei 0 #else - mfmsr r10 /* Get current interrupt state */ - rldicl r9,r10,48,1 /* clear MSR_EE */ - rotldi r9,r9,16 - mtmsrd r9,1 /* Update machine state */ + ld r10,PACAKMSR(r13) /* Get kernel MSR without EE */ + mtmsrd r10,1 /* Update machine state */ #endif /* CONFIG_PPC_BOOK3E */ #ifdef CONFIG_PREEMPT @@ -625,8 +623,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) * userspace and we take an exception after restoring r13, * we end up corrupting the userspace r13 value. */ - mfmsr r4 - andc r4,r4,r0 /* r0 contains MSR_RI here */ + ld r4,PACAKMSR(r13) /* Get kernel MSR without EE */ + andc r4,r4,r0 /* r0 contains MSR_RI here */ mtmsrd r4,1 /* @@ -686,9 +684,7 @@ do_work: #ifdef CONFIG_PPC_BOOK3E wrteei 0 #else - mfmsr r10 - rldicl r10,r10,48,1 - rotldi r10,r10,16 + ld r10,PACAKMSR(r13) /* Get kernel MSR without EE */ mtmsrd r10,1 #endif /* CONFIG_PPC_BOOK3E */ li r0,0 diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 0fb42ae21694..02448ea58ad3 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -848,9 +848,8 @@ fast_exception_return: REST_GPR(0, r1) REST_8GPRS(2, r1) - mfmsr r10 - rldicl r10,r10,48,1 /* clear EE */ - rldicr r10,r10,16,61 /* clear RI (LE is 0 already) */ + ld r10,PACAKMSR(r13) + clrrdi r10,r10,2 /* clear RI */ mtmsrd r10,1 mtspr SPRN_SRR1,r12 -- cgit v1.2.2 From eb740b5f3e6559a8f1c22e2505914d07f9632881 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 27 Feb 
2012 20:04:04 +0000 Subject: powerpc/eeh: Introduce EEH device Original EEH implementation depends on struct pci_dn heavily. However, EEH shouldn't depend on that actually because EEH needn't share much information with other PCI components. That's to say, EEH should have worked independently. The patch introduces struct eeh_dev so that EEH core components needn't be working based on struct pci_dn in future. Also, struct pci_dn, struct eeh_dev instances are created in dynamic fashion and the binding with EEH device, OF node, PCI device is implemented as well. The EEH devices are created after PHBs are detected and initialized, but PCI enumeration hasn't started yet. Apart from that, PHB might be created dynamically through DLPAR component and the EEH devices should be created as well. Another case might be OF node is created dynamically by DR (Dynamic Reconfiguration), which has been defined by PAPR. For those OF nodes created by DR, EEH devices should be also created accordingly. The binding between EEH device and OF node is done while the EEH device is initially created. The binding between EEH device and PCI device should be done after PCI enumeration is done. Besides, PCI hotplug also needs the binding so that the EEH devices could be traced from the newly coming PCI buses or PCI devices.
Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/of_platform.c | 6 +++++- arch/powerpc/kernel/rtas_pci.c | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index e1612dfb4a93..2049f2d00ffe 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -21,12 +21,13 @@ #include #include #include +#include #include #include #include #include -#include +#include #ifdef CONFIG_PPC_OF_PLATFORM_PCI @@ -66,6 +67,9 @@ static int __devinit of_pci_phb_probe(struct platform_device *dev) /* Init pci_dn data structures */ pci_devs_phb_init_dynamic(phb); + /* Create EEH devices for the PHB */ + eeh_dev_phb_init_dynamic(phb); + /* Register devices with EEH */ #ifdef CONFIG_EEH if (dev->dev.of_node->child) diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index 6cd8f0196b6d..517bd86bc3f0 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -275,6 +275,9 @@ void __init find_and_init_phbs(void) of_node_put(root); pci_devs_phb_init(); + /* Create EEH devices for all PHBs */ + eeh_dev_phb_init(); + /* * pci_probe_only and pci_assign_all_buses can be set via properties * in chosen. -- cgit v1.2.2 From 7230c5644188cd9e3fb380cc97dde00c464a3ba7 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 6 Mar 2012 18:27:59 +1100 Subject: powerpc: Rework lazy-interrupt handling The current implementation of lazy interrupts handling has some issues that this tries to address. We don't do the various workarounds we need to do when re-enabling interrupts in some cases such as when returning from an interrupt and thus we may still lose or get delayed decrementer or doorbell interrupts. 
The current scheme also makes it much harder to handle the external "edge" interrupts provided by some BookE processors when using the EPR facility (External Proxy) and the Freescale Hypervisor. Additionally, we tend to keep interrupts hard disabled in a number of cases, such as decrementer interrupts, external interrupts, or when a masked decrementer interrupt is pending. This is sub-optimal. This is an attempt at fixing it all in one go by reworking the way we do the lazy interrupt disabling from the ground up. The base idea is to replace the "hard_enabled" field with an "irq_happened" field in which we store a bit mask of what interrupt occurred while soft-disabled. When re-enabling, either via arch_local_irq_restore() or when returning from an interrupt, we can now decide what to do by testing bits in that field. We then implement replaying of the missed interrupts either by re-using the existing exception frame (in exception exit case) or via the creation of a new one from an assembly trampoline (in the arch_local_irq_enable case). This removes the need to play with the decrementer to try to create fake interrupts, among others. In addition, this adds a few refinements: - We no longer hard disable decrementer interrupts that occur while soft-disabled. We now simply bump the decrementer back to max (on BookS) or leave it stopped (on BookE) and continue with hard interrupts enabled, which means that we'll potentially get better sample quality from performance monitor interrupts. - Timer, decrementer and doorbell interrupts now hard-enable shortly after removing the source of the interrupt, which means they no longer run entirely hard disabled. Again, this will improve perf sample quality. - On Book3E 64-bit, we now make the performance monitor interrupt act as an NMI like Book3S (the necessary C code for that to work appears to already be present in the FSL perf code, notably calling nmi_enter instead of irq_enter).
(This also fixes a bug where BookE perfmon interrupts could clobber r14 ... oops) - We could make "masked" decrementer interrupts act as NMIs when doing timer-based perf sampling to improve the sample quality. Signed-off-by-yet: Benjamin Herrenschmidt --- v2: - Add hard-enable to decrementer, timer and doorbells - Fix CR clobber in masked irq handling on BookE - Make embedded perf interrupt act as an NMI - Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want to retrigger an interrupt without preventing hard-enable v3: - Fix or vs. ori bug on Book3E - Fix enabling of interrupts for some exceptions on Book3E v4: - Fix resend of doorbells on return from interrupt on Book3E v5: - Rebased on top of my latest series, which involves some significant rework of some aspects of the patch. v6: - 32-bit compile fix - more compile fixes with various .config combos - factor out the asm code to soft-disable interrupts - remove the C wrapper around preempt_schedule_irq v7: - Fix a bug with hard irq state tracking on native power7 --- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/dbell.c | 2 + arch/powerpc/kernel/entry_64.S | 153 ++++++++++++++++++------ arch/powerpc/kernel/exceptions-64e.S | 221 +++++++++++++++++++++++------------ arch/powerpc/kernel/exceptions-64s.S | 150 +++++++++--------------- arch/powerpc/kernel/head_64.S | 24 ++-- arch/powerpc/kernel/idle.c | 6 +- arch/powerpc/kernel/idle_book3e.S | 25 +--- arch/powerpc/kernel/idle_power4.S | 24 +++- arch/powerpc/kernel/idle_power7.S | 23 +++- arch/powerpc/kernel/irq.c | 204 +++++++++++++++++++++++--------- arch/powerpc/kernel/process.c | 3 + arch/powerpc/kernel/time.c | 8 +- 13 files changed, 551 insertions(+), 294 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 04caee7d9bc1..cdd0d264415f 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -147,7 +147,7 @@ int main(void) 
DEFINE(PACAKBASE, offsetof(struct paca_struct, kernelbase)); DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); - DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); + DEFINE(PACAIRQHAPPENED, offsetof(struct paca_struct, irq_happened)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); #ifdef CONFIG_PPC_MM_SLICES DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct, diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index 2cc451aaaca7..5b25c8060fd6 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -37,6 +37,8 @@ void doorbell_exception(struct pt_regs *regs) irq_enter(); + may_hard_irq_enable(); + smp_ipi_demux(); irq_exit(); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index c513beb78b3b..f8a7a1a1a9f4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -32,6 +32,7 @@ #include #include #include +#include /* * System calls. @@ -583,18 +584,72 @@ _GLOBAL(ret_from_except_lite) bne do_work #endif /* !CONFIG_PREEMPT */ + .globl fast_exc_return_irq +fast_exc_return_irq: restore: + /* + * This is the main kernel exit path, we first check if we + * have to change our interrupt state. + */ ld r5,SOFTE(r1) - TRACE_AND_RESTORE_IRQ(r5); + lbz r6,PACASOFTIRQEN(r13) + cmpwi cr1,r5,0 + cmpw cr0,r5,r6 + beq cr0,4f + + /* We do, handle disable first, which is easy */ + bne cr1,3f; + li r0,0 + stb r0,PACASOFTIRQEN(r13); + TRACE_DISABLE_INTS + b 4f - /* extract EE bit and use it to restore paca->hard_enabled */ - ld r3,_MSR(r1) - rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ - stb r4,PACAHARDIRQEN(r13) +3: /* + * We are about to soft-enable interrupts (we are hard disabled + * at this point). We check if there's anything that needs to + * be replayed first. 
+ */ + lbz r0,PACAIRQHAPPENED(r13) + cmpwi cr0,r0,0 + bne- restore_check_irq_replay + + /* + * Get here when nothing happened while soft-disabled, just + * soft-enable and move-on. We will hard-enable as a side + * effect of rfi + */ +restore_no_replay: + TRACE_ENABLE_INTS + li r0,1 + stb r0,PACASOFTIRQEN(r13); + /* + * Final return path. BookE is handled in a different file + */ +4: #ifdef CONFIG_PPC_BOOK3E b .exception_return_book3e #else + /* + * Clear the reservation. If we know the CPU tracks the address of + * the reservation then we can potentially save some cycles and use + * a larx. On POWER6 and POWER7 this is significantly faster. + */ +BEGIN_FTR_SECTION + stdcx. r0,0,r1 /* to clear the reservation */ +FTR_SECTION_ELSE + ldarx r4,0,r1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) + + /* + * Some code path such as load_up_fpu or altivec return directly + * here. They run entirely hard disabled and do not alter the + * interrupt state. They also don't use lwarx/stwcx. and thus + * are known not to leave dangling reservations. + */ + .globl fast_exception_return +fast_exception_return: + ld r3,_MSR(r1) ld r4,_CTR(r1) ld r0,_LINK(r1) mtctr r4 @@ -607,17 +662,6 @@ restore: andi. r0,r3,MSR_RI beq- unrecov_restore - /* - * Clear the reservation. If we know the CPU tracks the address of - * the reservation then we can potentially save some cycles and use - * a larx. On POWER6 and POWER7 this is significantly faster. - */ -BEGIN_FTR_SECTION - stdcx. r0,0,r1 /* to clear the reservation */ -FTR_SECTION_ELSE - ldarx r4,0,r1 -ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) - /* * Clear RI before restoring r13. If we are returning to * userspace and we take an exception after restoring r13, @@ -629,7 +673,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) /* * r13 is our per cpu area, only restore it if we are returning to - * userspace + * userspace the value stored in the stack frame may belong to + * another CPU. */ andi. 
r0,r3,MSR_PR beq 1f @@ -654,6 +699,55 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) #endif /* CONFIG_PPC_BOOK3E */ + /* + * Something did happen, check if a re-emit is needed + * (this also clears paca->irq_happened) + */ +restore_check_irq_replay: + /* XXX: We could implement a fast path here where we check + * for irq_happened being just 0x01, in which case we can + * clear it and return. That means that we would potentially + * miss a decrementer having wrapped all the way around. + * + * Still, this might be useful for things like hash_page + */ + bl .__check_irq_replay + cmpwi cr0,r3,0 + beq restore_no_replay + + /* + * We need to re-emit an interrupt. We do so by re-using our + * existing exception frame. We first change the trap value, + * but we need to ensure we preserve the low nibble of it + */ + ld r4,_TRAP(r1) + clrldi r4,r4,60 + or r4,r4,r3 + std r4,_TRAP(r1) + + /* + * Then find the right handler and call it. Interrupts are + * still soft-disabled and we keep them that way. + */ + cmpwi cr0,r3,0x500 + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl .do_IRQ + b .ret_from_except +1: cmpwi cr0,r3,0x900 + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl .timer_interrupt + b .ret_from_except +#ifdef CONFIG_PPC_BOOK3E +1: cmpwi cr0,r3,0x280 + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl .doorbell_exception + b .ret_from_except +#endif /* CONFIG_PPC_BOOK3E */ +1: b .ret_from_except /* What else to do here ? */ + do_work: #ifdef CONFIG_PREEMPT andi. r0,r3,MSR_PR /* Returning to user mode? */ @@ -666,18 +760,11 @@ do_work: crandc eq,cr1*4+eq,eq bne restore - /* Here we are preempting the current task. - * - * Ensure interrupts are soft-disabled. We also properly mark - * the PACA to reflect the fact that they are hard-disabled - * and trace the change + /* + * Here we are preempting the current task. 
We want to make + * sure we are soft-disabled first */ - li r0,0 - stb r0,PACASOFTIRQEN(r13) - stb r0,PACAHARDIRQEN(r13) - TRACE_DISABLE_INTS - - /* Call the scheduler with soft IRQs off */ + SOFT_DISABLE_INTS(r3,r4) 1: bl .preempt_schedule_irq /* Hard-disable interrupts again (and update PACA) */ @@ -687,8 +774,8 @@ do_work: ld r10,PACAKMSR(r13) /* Get kernel MSR without EE */ mtmsrd r10,1 #endif /* CONFIG_PPC_BOOK3E */ - li r0,0 - stb r0,PACAHARDIRQEN(r13) + li r0,PACA_IRQ_HARD_DIS + stb r0,PACAIRQHAPPENED(r13) /* Re-test flags and eventually loop */ clrrdi r9,r1,THREAD_SHIFT @@ -710,14 +797,12 @@ user_work: andi. r0,r4,_TIF_NEED_RESCHED beq 1f - li r5,1 - TRACE_AND_RESTORE_IRQ(r5); + bl .restore_interrupts bl .schedule b .ret_from_except_lite 1: bl .save_nvgprs - li r5,1 - TRACE_AND_RESTORE_IRQ(r5); + bl .restore_interrupts addi r3,r1,STACK_FRAME_OVERHEAD bl .do_notify_resume b .ret_from_except diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index c4c34665c221..7215cc2495df 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -24,6 +24,7 @@ #include #include #include +#include /* XXX This will ultimately add space for a special exception save * structure used to save things like SRR0/SRR1, SPRGs, MAS, etc... 
@@ -77,59 +78,55 @@ #define SPRN_MC_SRR1 SPRN_MCSRR1 #define NORMAL_EXCEPTION_PROLOG(n, addition) \ - EXCEPTION_PROLOG(n, GEN, addition##_GEN) + EXCEPTION_PROLOG(n, GEN, addition##_GEN(n)) #define CRIT_EXCEPTION_PROLOG(n, addition) \ - EXCEPTION_PROLOG(n, CRIT, addition##_CRIT) + EXCEPTION_PROLOG(n, CRIT, addition##_CRIT(n)) #define DBG_EXCEPTION_PROLOG(n, addition) \ - EXCEPTION_PROLOG(n, DBG, addition##_DBG) + EXCEPTION_PROLOG(n, DBG, addition##_DBG(n)) #define MC_EXCEPTION_PROLOG(n, addition) \ - EXCEPTION_PROLOG(n, MC, addition##_MC) + EXCEPTION_PROLOG(n, MC, addition##_MC(n)) /* Variants of the "addition" argument for the prolog */ -#define PROLOG_ADDITION_NONE_GEN -#define PROLOG_ADDITION_NONE_CRIT -#define PROLOG_ADDITION_NONE_DBG -#define PROLOG_ADDITION_NONE_MC +#define PROLOG_ADDITION_NONE_GEN(n) +#define PROLOG_ADDITION_NONE_CRIT(n) +#define PROLOG_ADDITION_NONE_DBG(n) +#define PROLOG_ADDITION_NONE_MC(n) -#define PROLOG_ADDITION_MASKABLE_GEN \ +#define PROLOG_ADDITION_MASKABLE_GEN(n) \ lbz r11,PACASOFTIRQEN(r13); /* are irqs soft-disabled ? */ \ cmpwi cr0,r11,0; /* yes -> go out of line */ \ - beq masked_interrupt_book3e; + beq masked_interrupt_book3e_##n -#define PROLOG_ADDITION_2REGS_GEN \ +#define PROLOG_ADDITION_2REGS_GEN(n) \ std r14,PACA_EXGEN+EX_R14(r13); \ std r15,PACA_EXGEN+EX_R15(r13) -#define PROLOG_ADDITION_1REG_GEN \ +#define PROLOG_ADDITION_1REG_GEN(n) \ std r14,PACA_EXGEN+EX_R14(r13); -#define PROLOG_ADDITION_2REGS_CRIT \ +#define PROLOG_ADDITION_2REGS_CRIT(n) \ std r14,PACA_EXCRIT+EX_R14(r13); \ std r15,PACA_EXCRIT+EX_R15(r13) -#define PROLOG_ADDITION_2REGS_DBG \ +#define PROLOG_ADDITION_2REGS_DBG(n) \ std r14,PACA_EXDBG+EX_R14(r13); \ std r15,PACA_EXDBG+EX_R15(r13) -#define PROLOG_ADDITION_2REGS_MC \ +#define PROLOG_ADDITION_2REGS_MC(n) \ std r14,PACA_EXMC+EX_R14(r13); \ std r15,PACA_EXMC+EX_R15(r13) -#define PROLOG_ADDITION_DOORBELL_GEN \ - lbz r11,PACASOFTIRQEN(r13); /* are irqs soft-disabled ? 
*/ \ - cmpwi cr0,r11,0; /* yes -> go out of line */ \ - beq masked_doorbell_book3e - /* Core exception code for all exceptions except TLB misses. * XXX: Needs to make SPRN_SPRG_GEN depend on exception type */ #define EXCEPTION_COMMON(n, excf, ints) \ +exc_##n##_common: \ std r0,GPR0(r1); /* save r0 in stackframe */ \ std r2,GPR2(r1); /* save r2 in stackframe */ \ SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ @@ -167,20 +164,21 @@ std r0,RESULT(r1); /* clear regs->result */ \ ints; -/* Variants for the "ints" argument */ +/* Variants for the "ints" argument. This one does nothing when we want + * to keep interrupts in their original state + */ #define INTS_KEEP -#define INTS_DISABLE_SOFT \ - stb r0,PACASOFTIRQEN(r13); /* mark interrupts soft-disabled */ \ - TRACE_DISABLE_INTS; -#define INTS_DISABLE_HARD \ - stb r0,PACAHARDIRQEN(r13); /* and hard disabled */ -#define INTS_DISABLE_ALL \ - INTS_DISABLE_SOFT \ - INTS_DISABLE_HARD - -/* This is called by exceptions that used INTS_KEEP (that is did not clear - * neither soft nor hard IRQ indicators in the PACA. This will restore MSR:EE - * to it's previous value + +/* This second version is meant for exceptions that don't immediately + * hard-enable. We set a bit in paca->irq_happened to ensure that + * a subsequent call to arch_local_irq_restore() will properly + * hard-enable and avoid the fast-path + */ +#define INTS_DISABLE SOFT_DISABLE_INTS(r3,r4) + +/* This is called by exceptions that used INTS_KEEP (that did not touch + * irq indicators in the PACA). 
This will restore MSR:EE to it's previous + * value * * XXX In the long run, we may want to open-code it in order to separate the * load from the wrtee, thus limiting the latency caused by the dependency @@ -238,7 +236,7 @@ exc_##n##_bad_stack: \ #define MASKABLE_EXCEPTION(trapnum, label, hdlr, ack) \ START_EXCEPTION(label); \ NORMAL_EXCEPTION_PROLOG(trapnum, PROLOG_ADDITION_MASKABLE) \ - EXCEPTION_COMMON(trapnum, PACA_EXGEN, INTS_DISABLE_ALL) \ + EXCEPTION_COMMON(trapnum, PACA_EXGEN, INTS_DISABLE) \ ack(r8); \ CHECK_NAPPING(); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ @@ -289,7 +287,7 @@ interrupt_end_book3e: /* Critical Input Interrupt */ START_EXCEPTION(critical_input); CRIT_EXCEPTION_PROLOG(0x100, PROLOG_ADDITION_NONE) -// EXCEPTION_COMMON(0x100, PACA_EXCRIT, INTS_DISABLE_ALL) +// EXCEPTION_COMMON(0x100, PACA_EXCRIT, INTS_DISABLE) // bl special_reg_save_crit // CHECK_NAPPING(); // addi r3,r1,STACK_FRAME_OVERHEAD @@ -300,7 +298,7 @@ interrupt_end_book3e: /* Machine Check Interrupt */ START_EXCEPTION(machine_check); CRIT_EXCEPTION_PROLOG(0x200, PROLOG_ADDITION_NONE) -// EXCEPTION_COMMON(0x200, PACA_EXMC, INTS_DISABLE_ALL) +// EXCEPTION_COMMON(0x200, PACA_EXMC, INTS_DISABLE) // bl special_reg_save_mc // addi r3,r1,STACK_FRAME_OVERHEAD // CHECK_NAPPING(); @@ -313,7 +311,7 @@ interrupt_end_book3e: NORMAL_EXCEPTION_PROLOG(0x300, PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - EXCEPTION_COMMON(0x300, PACA_EXGEN, INTS_DISABLE_ALL) + EXCEPTION_COMMON(0x300, PACA_EXGEN, INTS_DISABLE) b storage_fault_common /* Instruction Storage Interrupt */ @@ -321,7 +319,7 @@ interrupt_end_book3e: NORMAL_EXCEPTION_PROLOG(0x400, PROLOG_ADDITION_2REGS) li r15,0 mr r14,r10 - EXCEPTION_COMMON(0x400, PACA_EXGEN, INTS_DISABLE_ALL) + EXCEPTION_COMMON(0x400, PACA_EXGEN, INTS_DISABLE) b storage_fault_common /* External Input Interrupt */ @@ -339,12 +337,11 @@ interrupt_end_book3e: START_EXCEPTION(program); NORMAL_EXCEPTION_PROLOG(0x700, PROLOG_ADDITION_1REG) mfspr r14,SPRN_ESR - 
EXCEPTION_COMMON(0x700, PACA_EXGEN, INTS_DISABLE_SOFT) + EXCEPTION_COMMON(0x700, PACA_EXGEN, INTS_DISABLE) std r14,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD ld r14,PACA_EXGEN+EX_R14(r13) bl .save_nvgprs - INTS_RESTORE_HARD bl .program_check_exception b .ret_from_except @@ -358,7 +355,7 @@ interrupt_end_book3e: beq- 1f bl .load_up_fpu b fast_exception_return -1: INTS_DISABLE_ALL +1: INTS_DISABLE bl .save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl .kernel_fp_unavailable_exception @@ -373,7 +370,7 @@ interrupt_end_book3e: /* Watchdog Timer Interrupt */ START_EXCEPTION(watchdog); CRIT_EXCEPTION_PROLOG(0x9f0, PROLOG_ADDITION_NONE) -// EXCEPTION_COMMON(0x9f0, PACA_EXCRIT, INTS_DISABLE_ALL) +// EXCEPTION_COMMON(0x9f0, PACA_EXCRIT, INTS_DISABLE) // bl special_reg_save_crit // CHECK_NAPPING(); // addi r3,r1,STACK_FRAME_OVERHEAD @@ -392,7 +389,7 @@ interrupt_end_book3e: /* Auxiliary Processor Unavailable Interrupt */ START_EXCEPTION(ap_unavailable); NORMAL_EXCEPTION_PROLOG(0xf20, PROLOG_ADDITION_NONE) - EXCEPTION_COMMON(0xf20, PACA_EXGEN, INTS_DISABLE_ALL) + EXCEPTION_COMMON(0xf20, PACA_EXGEN, INTS_DISABLE) bl .save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl .unknown_exception @@ -450,7 +447,7 @@ interrupt_end_book3e: mfspr r15,SPRN_SPRG_CRIT_SCRATCH mtspr SPRN_SPRG_GEN_SCRATCH,r15 mfspr r14,SPRN_DBSR - EXCEPTION_COMMON(0xd00, PACA_EXCRIT, INTS_DISABLE_ALL) + EXCEPTION_COMMON(0xd00, PACA_EXCRIT, INTS_DISABLE) std r14,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD mr r4,r14 @@ -465,7 +462,7 @@ kernel_dbg_exc: /* Debug exception as a debug interrupt*/ START_EXCEPTION(debug_debug); - DBG_EXCEPTION_PROLOG(0xd00, PROLOG_ADDITION_2REGS) + DBG_EXCEPTION_PROLOG(0xd08, PROLOG_ADDITION_2REGS) /* * If there is a single step or branch-taken exception in an @@ -515,7 +512,7 @@ kernel_dbg_exc: mfspr r15,SPRN_SPRG_DBG_SCRATCH mtspr SPRN_SPRG_GEN_SCRATCH,r15 mfspr r14,SPRN_DBSR - EXCEPTION_COMMON(0xd00, PACA_EXDBG, INTS_DISABLE_ALL) + EXCEPTION_COMMON(0xd08, PACA_EXDBG, INTS_DISABLE) std 
r14,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD mr r4,r14 @@ -525,21 +522,20 @@ kernel_dbg_exc: bl .DebugException b .ret_from_except - MASKABLE_EXCEPTION(0x260, perfmon, .performance_monitor_exception, ACK_NONE) - -/* Doorbell interrupt */ - START_EXCEPTION(doorbell) - NORMAL_EXCEPTION_PROLOG(0x2070, PROLOG_ADDITION_DOORBELL) - EXCEPTION_COMMON(0x2070, PACA_EXGEN, INTS_DISABLE_ALL) - CHECK_NAPPING() + START_EXCEPTION(perfmon); + NORMAL_EXCEPTION_PROLOG(0x260, PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x260, PACA_EXGEN, INTS_DISABLE) addi r3,r1,STACK_FRAME_OVERHEAD - bl .doorbell_exception + bl .performance_monitor_exception b .ret_from_except_lite +/* Doorbell interrupt */ + MASKABLE_EXCEPTION(0x280, doorbell, .doorbell_exception, ACK_NONE) + /* Doorbell critical Interrupt */ START_EXCEPTION(doorbell_crit); - CRIT_EXCEPTION_PROLOG(0x2080, PROLOG_ADDITION_NONE) -// EXCEPTION_COMMON(0x2080, PACA_EXCRIT, INTS_DISABLE_ALL) + CRIT_EXCEPTION_PROLOG(0x2a0, PROLOG_ADDITION_NONE) +// EXCEPTION_COMMON(0x2a0, PACA_EXCRIT, INTS_DISABLE) // bl special_reg_save_crit // CHECK_NAPPING(); // addi r3,r1,STACK_FRAME_OVERHEAD @@ -547,36 +543,114 @@ kernel_dbg_exc: // b ret_from_crit_except b . +/* Guest Doorbell */ MASKABLE_EXCEPTION(0x2c0, guest_doorbell, .unknown_exception, ACK_NONE) - MASKABLE_EXCEPTION(0x2e0, guest_doorbell_crit, .unknown_exception, ACK_NONE) - MASKABLE_EXCEPTION(0x310, hypercall, .unknown_exception, ACK_NONE) - MASKABLE_EXCEPTION(0x320, ehpriv, .unknown_exception, ACK_NONE) +/* Guest Doorbell critical Interrupt */ + START_EXCEPTION(guest_doorbell_crit); + CRIT_EXCEPTION_PROLOG(0x2e0, PROLOG_ADDITION_NONE) +// EXCEPTION_COMMON(0x2e0, PACA_EXCRIT, INTS_DISABLE) +// bl special_reg_save_crit +// CHECK_NAPPING(); +// addi r3,r1,STACK_FRAME_OVERHEAD +// bl .guest_doorbell_critical_exception +// b ret_from_crit_except + b . 
+ +/* Hypervisor call */ + START_EXCEPTION(hypercall); + NORMAL_EXCEPTION_PROLOG(0x310, PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x310, PACA_EXGEN, INTS_KEEP) + addi r3,r1,STACK_FRAME_OVERHEAD + bl .save_nvgprs + INTS_RESTORE_HARD + bl .unknown_exception + b .ret_from_except + +/* Embedded Hypervisor priviledged */ + START_EXCEPTION(ehpriv); + NORMAL_EXCEPTION_PROLOG(0x320, PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x320, PACA_EXGEN, INTS_KEEP) + addi r3,r1,STACK_FRAME_OVERHEAD + bl .save_nvgprs + INTS_RESTORE_HARD + bl .unknown_exception + b .ret_from_except /* - * An interrupt came in while soft-disabled; clear EE in SRR1, - * clear paca->hard_enabled and return. + * An interrupt came in while soft-disabled; We mark paca->irq_happened + * accordingly and if the interrupt is level sensitive, we hard disable */ -masked_doorbell_book3e: - mtcr r10 - /* Resend the doorbell to fire again when ints enabled */ - mfspr r10,SPRN_PIR - PPC_MSGSND(r10) - b masked_interrupt_book3e_common -masked_interrupt_book3e: +masked_interrupt_book3e_0x500: + /* XXX When adding support for EPR, use PACA_IRQ_EE_EDGE */ + li r11,PACA_IRQ_EE + b masked_interrupt_book3e_full_mask + +masked_interrupt_book3e_0x900: + ACK_DEC(r11); + li r11,PACA_IRQ_DEC + b masked_interrupt_book3e_no_mask +masked_interrupt_book3e_0x980: + ACK_FIT(r11); + li r11,PACA_IRQ_DEC + b masked_interrupt_book3e_no_mask +masked_interrupt_book3e_0x280: +masked_interrupt_book3e_0x2c0: + li r11,PACA_IRQ_DBELL + b masked_interrupt_book3e_no_mask + +masked_interrupt_book3e_no_mask: mtcr r10 -masked_interrupt_book3e_common: - stb r11,PACAHARDIRQEN(r13) + lbz r10,PACAIRQHAPPENED(r13) + or r10,r10,r11 + stb r10,PACAIRQHAPPENED(r13) + b 1f +masked_interrupt_book3e_full_mask: + mtcr r10 + lbz r10,PACAIRQHAPPENED(r13) + or r10,r10,r11 + stb r10,PACAIRQHAPPENED(r13) mfspr r10,SPRN_SRR1 rldicl r11,r10,48,1 /* clear MSR_EE */ rotldi r10,r11,16 mtspr SPRN_SRR1,r10 - ld r10,PACA_EXGEN+EX_R10(r13); /* restore registers */ +1: ld 
r10,PACA_EXGEN+EX_R10(r13); ld r11,PACA_EXGEN+EX_R11(r13); mfspr r13,SPRN_SPRG_GEN_SCRATCH; rfi b . +/* + * Called from arch_local_irq_enable when an interrupt needs + * to be resent. r3 contains either 0x500,0x900,0x260 or 0x280 + * to indicate the kind of interrupt. MSR:EE is already off. + * We generate a stackframe like if a real interrupt had happened. + * + * Note: While MSR:EE is off, we need to make sure that _MSR + * in the generated frame has EE set to 1 or the exception + * handler will not properly re-enable them. + */ +_GLOBAL(__replay_interrupt) + /* We are going to jump to the exception common code which + * will retrieve various register values from the PACA which + * we don't give a damn about. + */ + mflr r10 + mfmsr r11 + mfcr r4 + mtspr SPRN_SPRG_GEN_SCRATCH,r13; + std r1,PACA_EXGEN+EX_R1(r13); + stw r4,PACA_EXGEN+EX_CR(r13); + ori r11,r11,MSR_EE + subi r1,r1,INT_FRAME_SIZE; + cmpwi cr0,r3,0x500 + beq exc_0x500_common + cmpwi cr0,r3,0x900 + beq exc_0x900_common + cmpwi cr0,r3,0x280 + beq exc_0x280_common + blr + /* * This is called from 0x300 and 0x400 handlers after the prologs with @@ -679,6 +753,8 @@ BAD_STACK_TRAMPOLINE(0x000) BAD_STACK_TRAMPOLINE(0x100) BAD_STACK_TRAMPOLINE(0x200) BAD_STACK_TRAMPOLINE(0x260) +BAD_STACK_TRAMPOLINE(0x280) +BAD_STACK_TRAMPOLINE(0x2a0) BAD_STACK_TRAMPOLINE(0x2c0) BAD_STACK_TRAMPOLINE(0x2e0) BAD_STACK_TRAMPOLINE(0x300) @@ -696,11 +772,10 @@ BAD_STACK_TRAMPOLINE(0xa00) BAD_STACK_TRAMPOLINE(0xb00) BAD_STACK_TRAMPOLINE(0xc00) BAD_STACK_TRAMPOLINE(0xd00) +BAD_STACK_TRAMPOLINE(0xd08) BAD_STACK_TRAMPOLINE(0xe00) BAD_STACK_TRAMPOLINE(0xf00) BAD_STACK_TRAMPOLINE(0xf20) -BAD_STACK_TRAMPOLINE(0x2070) -BAD_STACK_TRAMPOLINE(0x2080) .globl bad_stack_book3e bad_stack_book3e: diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 02448ea58ad3..2d0868a4e2f0 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -12,6 +12,7 @@ * */ +#include 
#include #include @@ -356,34 +357,60 @@ do_stab_bolted_pSeries: KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40) /* - * An interrupt came in while soft-disabled; clear EE in SRR1, - * clear paca->hard_enabled and return. + * An interrupt came in while soft-disabled. We set paca->irq_happened, + * then, if it was a decrementer interrupt, we bump the dec to max and + * and return, else we hard disable and return. This is called with + * r10 containing the value to OR to the paca field. */ -masked_interrupt: - stb r10,PACAHARDIRQEN(r13) - mtcrf 0x80,r9 - ld r9,PACA_EXGEN+EX_R9(r13) - mfspr r10,SPRN_SRR1 - rldicl r10,r10,48,1 /* clear MSR_EE */ - rotldi r10,r10,16 - mtspr SPRN_SRR1,r10 - ld r10,PACA_EXGEN+EX_R10(r13) - GET_SCRATCH0(r13) - rfid +#define MASKED_INTERRUPT(_H) \ +masked_##_H##interrupt: \ + std r11,PACA_EXGEN+EX_R11(r13); \ + lbz r11,PACAIRQHAPPENED(r13); \ + or r11,r11,r10; \ + stb r11,PACAIRQHAPPENED(r13); \ + andi. r10,r10,PACA_IRQ_DEC; \ + beq 1f; \ + lis r10,0x7fff; \ + ori r10,r10,0xffff; \ + mtspr SPRN_DEC,r10; \ + b 2f; \ +1: mfspr r10,SPRN_##_H##SRR1; \ + rldicl r10,r10,48,1; /* clear MSR_EE */ \ + rotldi r10,r10,16; \ + mtspr SPRN_##_H##SRR1,r10; \ +2: mtcrf 0x80,r9; \ + ld r9,PACA_EXGEN+EX_R9(r13); \ + ld r10,PACA_EXGEN+EX_R10(r13); \ + ld r11,PACA_EXGEN+EX_R11(r13); \ + GET_SCRATCH0(r13); \ + ##_H##rfid; \ b . + + MASKED_INTERRUPT() + MASKED_INTERRUPT(H) -masked_Hinterrupt: - stb r10,PACAHARDIRQEN(r13) - mtcrf 0x80,r9 - ld r9,PACA_EXGEN+EX_R9(r13) - mfspr r10,SPRN_HSRR1 - rldicl r10,r10,48,1 /* clear MSR_EE */ - rotldi r10,r10,16 - mtspr SPRN_HSRR1,r10 - ld r10,PACA_EXGEN+EX_R10(r13) - GET_SCRATCH0(r13) - hrfid - b . +/* + * Called from arch_local_irq_enable when an interrupt needs + * to be resent. r3 contains 0x500 or 0x900 to indicate which + * kind of interrupt. MSR:EE is already off. We generate a + * stackframe like if a real interrupt had happened. 
+ * + * Note: While MSR:EE is off, we need to make sure that _MSR + * in the generated frame has EE set to 1 or the exception + * handler will not properly re-enable them. + */ +_GLOBAL(__replay_interrupt) + /* We are going to jump to the exception common code which + * will retrieve various register values from the PACA which + * we don't give a damn about, so we don't bother storing them. + */ + mfmsr r12 + mflr r11 + mfcr r9 + ori r12,r12,MSR_EE + andi. r3,r3,0x0800 + bne decrementer_common + b hardware_interrupt_common #ifdef CONFIG_PPC_PSERIES /* @@ -793,7 +820,8 @@ vsx_unavailable_common: EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN) #ifdef CONFIG_VSX BEGIN_FTR_SECTION - bne .load_up_vsx + beq 1f + b .load_up_vsx 1: END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif @@ -807,65 +835,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) .globl __end_handlers __end_handlers: -/* - * Return from an exception with minimal checks. - * The caller is assumed to have done EXCEPTION_PROLOG_COMMON. - * If interrupts have been enabled, or anything has been - * done that might have changed the scheduling status of - * any task or sent any task a signal, you should use - * ret_from_except or ret_from_except_lite instead of this. - */ -fast_exc_return_irq: /* restores irq state too */ - ld r3,SOFTE(r1) - TRACE_AND_RESTORE_IRQ(r3); - ld r12,_MSR(r1) - rldicl r4,r12,49,63 /* get MSR_EE to LSB */ - stb r4,PACAHARDIRQEN(r13) /* restore paca->hard_enabled */ - b 1f - - .globl fast_exception_return -fast_exception_return: - ld r12,_MSR(r1) -1: ld r11,_NIP(r1) - andi. r3,r12,MSR_RI /* check if RI is set */ - beq- unrecov_fer - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING - andi. 
r3,r12,MSR_PR - beq 2f - ACCOUNT_CPU_USER_EXIT(r3, r4) -2: -#endif - - ld r3,_CCR(r1) - ld r4,_LINK(r1) - ld r5,_CTR(r1) - ld r6,_XER(r1) - mtcr r3 - mtlr r4 - mtctr r5 - mtxer r6 - REST_GPR(0, r1) - REST_8GPRS(2, r1) - - ld r10,PACAKMSR(r13) - clrrdi r10,r10,2 /* clear RI */ - mtmsrd r10,1 - - mtspr SPRN_SRR1,r12 - mtspr SPRN_SRR0,r11 - REST_4GPRS(10, r1) - ld r1,GPR1(r1) - rfid - b . /* prevent speculative execution */ - -unrecov_fer: - bl .save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl .unrecoverable_exception - b 1b - - /* * Hash table stuff */ @@ -905,19 +874,16 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) * r4 contains the required access permissions * r5 contains the trap number * - * at return r3 = 0 for success + * at return r3 = 0 for success, 1 for page fault, negative for error */ bl .hash_page /* build HPTE if possible */ cmpdi r3,0 /* see if hash_page succeeded */ - /* - * Here we have interrupts hard-disabled, so it is sufficient - * to restore paca->{soft,hard}_enable and get out. - */ + /* Success */ beq fast_exc_return_irq /* Return from exception on success */ - /* For a hash failure, we don't bother re-enabling interrupts */ - ble- 13f + /* Error */ + blt- 13f /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 40759fbfb171..58bddee8e1e8 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -38,6 +38,7 @@ #include #include #include +#include /* The physical memory is laid out such that the secondary processor * spin code sits at 0x0000...0x00ff. On server, the vectors follow @@ -550,7 +551,8 @@ _GLOBAL(pmac_secondary_start) */ li r0,0 stb r0,PACASOFTIRQEN(r13) - stb r0,PACAHARDIRQEN(r13) + li r0,PACA_IRQ_HARD_DIS + stb r0,PACAIRQHAPPENED(r13) /* Create a temp kernel stack for use before relocation is on. 
*/ ld r1,PACAEMERGSP(r13) @@ -601,9 +603,12 @@ __secondary_start: li r7,0 mtlr r7 - /* Mark interrupts both hard and soft disabled */ - stb r7,PACAHARDIRQEN(r13) + /* Mark interrupts soft and hard disabled (they might be enabled + * in the PACA when doing hotplug) + */ stb r7,PACASOFTIRQEN(r13) + li r0,PACA_IRQ_HARD_DIS + stb r0,PACAIRQHAPPENED(r13) /* enable MMU and jump to start_secondary */ LOAD_REG_ADDR(r3, .start_secondary_prolog) @@ -750,13 +755,18 @@ _INIT_GLOBAL(start_here_common) /* Load the TOC (virtual address) */ ld r2,PACATOC(r13) + /* Do more system initializations in virtual mode */ bl .setup_system - /* Load up the kernel context */ -5: li r5,0 - stb r5,PACASOFTIRQEN(r13) /* Soft Disabled */ - stb r5,PACAHARDIRQEN(r13) /* Hard Disabled on others */ + /* Mark interrupts soft and hard disabled (they might be enabled + * in the PACA when doing hotplug) + */ + li r0,0 + stb r0,PACASOFTIRQEN(r13) + li r0,PACA_IRQ_HARD_DIS + stb r0,PACAIRQHAPPENED(r13) + /* Generic kernel entry */ bl .start_kernel /* Not reached */ diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 0a48bf5db6c8..8f7a2b62863d 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -84,7 +84,11 @@ void cpu_idle(void) start_critical_timings(); - local_irq_enable(); + /* Some power_save functions return with + * interrupts enabled, some don't. + */ + if (irqs_disabled()) + local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); } else { diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S index 16c002d6bdf1..ff007b59448d 100644 --- a/arch/powerpc/kernel/idle_book3e.S +++ b/arch/powerpc/kernel/idle_book3e.S @@ -29,43 +29,30 @@ _GLOBAL(book3e_idle) wrteei 0 /* Now check if an interrupt came in while we were soft disabled - * since we may otherwise lose it (doorbells etc...). We know - * that since PACAHARDIRQEN will have been cleared in that case. + * since we may otherwise lose it (doorbells etc...). 
*/ - lbz r3,PACAHARDIRQEN(r13) + lbz r3,PACAIRQHAPPENED(r13) cmpwi cr0,r3,0 - beqlr + bnelr - /* Now we are going to mark ourselves as soft and hard enables in + /* Now we are going to mark ourselves as soft and hard enabled in * order to be able to take interrupts while asleep. We inform lockdep * of that. We don't actually turn interrupts on just yet tho. */ #ifdef CONFIG_TRACE_IRQFLAGS stdu r1,-128(r1) bl .trace_hardirqs_on + addi r1,r1,128 #endif li r0,1 stb r0,PACASOFTIRQEN(r13) - stb r0,PACAHARDIRQEN(r13) /* Interrupts will make use return to LR, so get something we want * in there */ bl 1f - /* Hard disable interrupts again */ - wrteei 0 - - /* Mark them off again in the PACA as well */ - li r0,0 - stb r0,PACASOFTIRQEN(r13) - stb r0,PACAHARDIRQEN(r13) - - /* Tell lockdep about it */ -#ifdef CONFIG_TRACE_IRQFLAGS - bl .trace_hardirqs_off - addi r1,r1,128 -#endif + /* And return (interrupts are on) */ ld r0,16(r1) mtlr r0 blr diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S index ba3195478600..d8cdba4c28b2 100644 --- a/arch/powerpc/kernel/idle_power4.S +++ b/arch/powerpc/kernel/idle_power4.S @@ -14,6 +14,7 @@ #include #include #include +#include #undef DEBUG @@ -29,14 +30,31 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP) cmpwi 0,r4,0 beqlr - /* Go to NAP now */ + /* Hard disable interrupts */ mfmsr r7 rldicl r0,r7,48,1 rotldi r0,r0,16 - mtmsrd r0,1 /* hard-disable interrupts */ + mtmsrd r0,1 + + /* Check if something happened while soft-disabled */ + lbz r0,PACAIRQHAPPENED(r13) + cmpwi cr0,r0,0 + bnelr + + /* Soft-enable interrupts */ +#ifdef CONFIG_TRACE_IRQFLAGS + mflr r0 + std r0,16(r1) + stdu r1,-128(r1) + bl .trace_hardirqs_on + addi r1,r1,128 + ld r0,16(r1) + mtlr r0 +#endif /* CONFIG_TRACE_IRQFLAGS */ + + TRACE_ENABLE_INTS li r0,1 stb r0,PACASOFTIRQEN(r13) /* we'll hard-enable shortly */ - stb r0,PACAHARDIRQEN(r13) BEGIN_FTR_SECTION DSSALL sync diff --git a/arch/powerpc/kernel/idle_power7.S 
b/arch/powerpc/kernel/idle_power7.S index fcdff198da4b..0cdc9a392839 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -1,5 +1,5 @@ /* - * This file contains the power_save function for 970-family CPUs. + * This file contains the power_save function for Power7 CPUs. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -15,6 +15,7 @@ #include #include #include +#include #undef DEBUG @@ -51,9 +52,25 @@ _GLOBAL(power7_idle) rldicl r9,r9,48,1 rotldi r9,r9,16 mtmsrd r9,1 /* hard-disable interrupts */ + + /* Check if something happened while soft-disabled */ + lbz r0,PACAIRQHAPPENED(r13) + cmpwi cr0,r0,0 + beq 1f + addi r1,r1,INT_FRAME_SIZE + ld r0,16(r1) + mtlr r0 + blr + +1: /* We mark irqs hard disabled as this is the state we'll + * be in when returning and we need to tell arch_local_irq_restore() + * about it + */ + li r0,PACA_IRQ_HARD_DIS + stb r0,PACAIRQHAPPENED(r13) + + /* We haven't lost state ... 
yet */ li r0,0 - stb r0,PACASOFTIRQEN(r13) /* we'll hard-enable shortly */ - stb r0,PACAHARDIRQEN(r13) stb r0,PACA_NAPSTATELOST(r13) /* Continue saving state */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 9b6e80668cfb..eb804e15b29b 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -95,14 +95,14 @@ extern int tau_interrupts(int); int distribute_irqs = 1; -static inline notrace unsigned long get_hard_enabled(void) +static inline notrace unsigned long get_irq_happened(void) { - unsigned long enabled; + unsigned long happened; __asm__ __volatile__("lbz %0,%1(13)" - : "=r" (enabled) : "i" (offsetof(struct paca_struct, hard_enabled))); + : "=r" (happened) : "i" (offsetof(struct paca_struct, irq_happened))); - return enabled; + return happened; } static inline notrace void set_soft_enabled(unsigned long enable) @@ -111,88 +111,167 @@ static inline notrace void set_soft_enabled(unsigned long enable) : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -static inline notrace void decrementer_check_overflow(void) +static inline notrace int decrementer_check_overflow(void) { - u64 now = get_tb_or_rtc(); - u64 *next_tb; - - preempt_disable(); - next_tb = &__get_cpu_var(decrementers_next_tb); - + u64 now = get_tb_or_rtc(); + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + if (now >= *next_tb) set_dec(1); - preempt_enable(); + return now >= *next_tb; } -notrace void arch_local_irq_restore(unsigned long en) +/* This is called whenever we are re-enabling interrupts + * and returns either 0 (nothing to do) or 500/900 if there's + * either an EE or a DEC to generate. + * + * This is called in two contexts: From arch_local_irq_restore() + * before soft-enabling interrupts, and from the exception exit + * path when returning from an interrupt from a soft-disabled to + * a soft enabled context. In both case we have interrupts hard + * disabled. 
+ * + * We take care of only clearing the bits we handled in the + * PACA irq_happened field since we can only re-emit one at a + * time and we don't want to "lose" one. + */ +notrace unsigned int __check_irq_replay(void) { /* - * get_paca()->soft_enabled = en; - * Is it ever valid to use local_irq_restore(0) when soft_enabled is 1? - * That was allowed before, and in such a case we do need to take care - * that gcc will set soft_enabled directly via r13, not choose to use - * an intermediate register, lest we're preempted to a different cpu. + * We use local_paca rather than get_paca() to avoid all + * the debug_smp_processor_id() business in this low level + * function */ - set_soft_enabled(en); - if (!en) - return; + unsigned char happened = local_paca->irq_happened; -#ifdef CONFIG_PPC_STD_MMU_64 - if (firmware_has_feature(FW_FEATURE_ISERIES)) { - /* - * Do we need to disable preemption here? Not really: in the - * unlikely event that we're preempted to a different cpu in - * between getting r13, loading its lppaca_ptr, and loading - * its any_int, we might call iseries_handle_interrupts without - * an interrupt pending on the new cpu, but that's no disaster, - * is it? And the business of preempting us off the old cpu - * would itself involve a local_irq_restore which handles the - * interrupt to that cpu. - * - * But use "local_paca->lppaca_ptr" instead of "get_lppaca()" - * to avoid any preemption checking added into get_paca(). - */ - if (local_paca->lppaca_ptr->int_dword.any_int) - iseries_handle_interrupts(); + /* Clear bit 0 which we wouldn't clear otherwise */ + local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + + /* + * Force the delivery of pending soft-disabled interrupts on PS3. + * Any HV call will have this side effect. 
+ */ + if (firmware_has_feature(FW_FEATURE_PS3_LV1)) { + u64 tmp, tmp2; + lv1_get_version_info(&tmp, &tmp2); } -#endif /* CONFIG_PPC_STD_MMU_64 */ /* - * if (get_paca()->hard_enabled) return; - * But again we need to take care that gcc gets hard_enabled directly - * via r13, not choose to use an intermediate register, lest we're - * preempted to a different cpu in between the two instructions. + * We may have missed a decrementer interrupt. We check the + * decrementer itself rather than the paca irq_happened field + * in case we also had a rollover while hard disabled + */ + local_paca->irq_happened &= ~PACA_IRQ_DEC; + if (decrementer_check_overflow()) + return 0x900; + + /* Finally check if an external interrupt happened */ + local_paca->irq_happened &= ~PACA_IRQ_EE; + if (happened & PACA_IRQ_EE) + return 0x500; + +#ifdef CONFIG_PPC_BOOK3E + /* Finally check if an EPR external interrupt happened + * this bit is typically set if we need to handle another + * "edge" interrupt from within the MPIC "EPR" handler + */ + local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; + if (happened & PACA_IRQ_EE_EDGE) + return 0x500; + + local_paca->irq_happened &= ~PACA_IRQ_DBELL; + if (happened & PACA_IRQ_DBELL) + return 0x280; +#endif /* CONFIG_PPC_BOOK3E */ + + /* There should be nothing left ! */ + BUG_ON(local_paca->irq_happened != 0); + + return 0; +} + +notrace void arch_local_irq_restore(unsigned long en) +{ + unsigned char irq_happened; + unsigned int replay; + + /* Write the new soft-enabled value */ + set_soft_enabled(en); + if (!en) + return; + /* + * From this point onward, we can take interrupts, preempt, + * etc... unless we got hard-disabled. We check if an event + * happened. If none happened, we know we can just return. + * + * We may have preempted before the check below, in which case + * we are checking the "new" CPU instead of the old one. This + * is only a problem if an event happened on the "old" CPU. 
+ * + * External interrupt events on non-iseries will have caused + * interrupts to be hard-disabled, so there is no problem, we + * cannot have preempted. + * + * That leaves us with EEs on iSeries or decrementer interrupts, + * which I decided to safely ignore. The preemption would have + * itself been the result of an interrupt, upon which return we + * will have checked for pending events on the old CPU. */ - if (get_hard_enabled()) + irq_happened = get_irq_happened(); + if (!irq_happened) return; /* - * Need to hard-enable interrupts here. Since currently disabled, - * no need to take further asm precautions against preemption; but - * use local_paca instead of get_paca() to avoid preemption checking. + * We need to hard disable to get a trusted value from + * __check_irq_replay(). We also need to soft-disable + * again to avoid warnings in there due to the use of + * per-cpu variables. + * + * We know that if the value in irq_happened is exactly 0x01 + * then we are already hard disabled (there are other less + * common cases that we'll ignore for now), so we skip the + * (expensive) mtmsrd. */ - local_paca->hard_enabled = en; + if (unlikely(irq_happened != PACA_IRQ_HARD_DIS)) + __hard_irq_disable(); + set_soft_enabled(0); /* - * Trigger the decrementer if we have a pending event. Some processors - * only trigger on edge transitions of the sign bit. We might also - * have disabled interrupts long enough that the decrementer wrapped - * to positive. + * Check if anything needs to be re-emitted. We haven't + * soft-enabled yet to avoid warnings in decrementer_check_overflow + * accessing per-cpu variables */ - decrementer_check_overflow(); + replay = __check_irq_replay(); + + /* We can soft-enable now */ + set_soft_enabled(1); /* - * Force the delivery of pending soft-disabled interrupts on PS3. - * Any HV call will have this side effect. + * And replay if we have to. This will return with interrupts + * hard-enabled. 
*/ - if (firmware_has_feature(FW_FEATURE_PS3_LV1)) { - u64 tmp, tmp2; - lv1_get_version_info(&tmp, &tmp2); + if (replay) { + __replay_interrupt(replay); + return; } + /* Finally, let's ensure we are hard enabled */ __hard_irq_enable(); } EXPORT_SYMBOL(arch_local_irq_restore); + +/* + * This is specifically called by assembly code to re-enable interrupts + * if they are currently disabled. This is typically called before + * schedule() or do_signal() when returning to userspace. We do it + * in C to avoid the burden of dealing with lockdep etc... + */ +void restore_interrupts(void) +{ + if (irqs_disabled()) + local_irq_enable(); +} + #endif /* CONFIG_PPC64 */ int arch_show_interrupts(struct seq_file *p, int prec) @@ -360,8 +439,17 @@ void do_IRQ(struct pt_regs *regs) check_stack_overflow(); + /* + * Query the platform PIC for the interrupt & ack it. + * + * This will typically lower the interrupt line to the CPU + */ irq = ppc_md.get_irq(); + /* We can hard enable interrupts now */ + may_hard_irq_enable(); + + /* And finally process it */ if (irq != NO_IRQ && irq != NO_IRQ_IGNORE) handle_one_irq(irq); else if (irq != NO_IRQ_IGNORE) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index bf80a1d5f8fe..e40707032ac3 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -647,6 +647,9 @@ void show_regs(struct pt_regs * regs) printk("MSR: "REG" ", regs->msr); printbits(regs->msr, msr_bits); printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); +#ifdef CONFIG_PPC64 + printk("SOFTE: %ld\n", regs->softe); +#endif trap = TRAP(regs); if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR)) printk("CFAR: "REG"\n", regs->orig_gpr3); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 567dd7c3ac2a..f81c81b92f0e 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -259,7 +259,6 @@ void accumulate_stolen_time(void) u64 sst, ust; u8 save_soft_enabled = local_paca->soft_enabled; - 
u8 save_hard_enabled = local_paca->hard_enabled; /* We are called early in the exception entry, before * soft/hard_enabled are sync'ed to the expected state @@ -268,7 +267,6 @@ void accumulate_stolen_time(void) * complain */ local_paca->soft_enabled = 0; - local_paca->hard_enabled = 0; sst = scan_dispatch_log(local_paca->starttime_user); ust = scan_dispatch_log(local_paca->starttime); @@ -277,7 +275,6 @@ void accumulate_stolen_time(void) local_paca->stolen_time += ust + sst; local_paca->soft_enabled = save_soft_enabled; - local_paca->hard_enabled = save_hard_enabled; } static inline u64 calculate_stolen_time(u64 stop_tb) @@ -580,6 +577,11 @@ void timer_interrupt(struct pt_regs * regs) if (!cpu_online(smp_processor_id())) return; + /* Conditionally hard-enable interrupts now that the DEC has been + * bumped to its maximum value + */ + may_hard_irq_enable(); + trace_timer_interrupt_entry(regs); __get_cpu_var(irq_stat).timer_irqs++; -- cgit v1.2.2 From 10241842fbe900276634fee8d37ec48a7d8a762f Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Sun, 6 Nov 2011 11:51:07 -0600 Subject: powerpc: Add initial e6500 cpu support Add basic support for e6500 core in its single threaded mode. 
Signed-off-by: Kumar Gala --- arch/powerpc/kernel/cputable.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 81db9e2a8a20..4dccf51064ea 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -2019,6 +2019,24 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_e500mc, .platform = "ppce5500", }, + { /* e6500 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80400000, + .cpu_name = "e6500", + .cpu_features = CPU_FTRS_E6500, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | + MMU_FTR_USE_TLBILX, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 4, + .oprofile_cpu_type = "ppc/e6500", + .oprofile_type = PPC_OPROFILE_FSL_EMB, + .cpu_setup = __setup_cpu_e5500, + .cpu_restore = __restore_cpu_e5500, + .machine_check = machine_check_e500mc, + .platform = "ppce6500", + }, #ifdef CONFIG_PPC32 { /* default match */ .pvr_mask = 0x00000000, -- cgit v1.2.2 From 01e8ec4417d3c484986af0adaa0ae6632e0a59cd Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 16 Mar 2012 09:26:59 +1100 Subject: powerpc: Fix power4/970 idle code regression with lockdep in commit 7230c5644188cd9e3fb380cc97dde00c464a3ba7 "powerpc: Rework lazy-interrupt handling" I introduced a regression, accidentally calling irq tracing twice and not properly restoring a clobbered register (r7) later used for writing to the MSR. This caused lockups when booting on a G5 with lockdep enabled. 
Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/idle_power4.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S index d8cdba4c28b2..2c71b0fc9f91 100644 --- a/arch/powerpc/kernel/idle_power4.S +++ b/arch/powerpc/kernel/idle_power4.S @@ -50,9 +50,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP) addi r1,r1,128 ld r0,16(r1) mtlr r0 + mfmsr r7 #endif /* CONFIG_TRACE_IRQFLAGS */ - TRACE_ENABLE_INTS li r0,1 stb r0,PACASOFTIRQEN(r13) /* we'll hard-enable shortly */ BEGIN_FTR_SECTION -- cgit v1.2.2 From 7c801160be0adf826b7b792ee4eaf6a3ae47569d Mon Sep 17 00:00:00 2001 From: Vinh Nguyen Huu Tuong Date: Tue, 20 Dec 2011 02:43:34 +0000 Subject: powerpc/44x: The bug fixed support for APM821xx SoC and Bluestone board This patch consists of: - Fix the pvr mask for checking pvr in cputable.c - Fix the cpu name to be consistent with the cpu name described in the dts file Signed-off-by: Vinh Nguyen Huu Tuong Signed-off-by: Josh Boyer --- arch/powerpc/kernel/cputable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 81db9e2a8a20..87353baf21fb 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -1816,7 +1816,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .platform = "ppc440", }, { /* 464 in APM821xx */ - .pvr_mask = 0xffffff00, + .pvr_mask = 0xfffffff0, .pvr_value = 0x12C41C80, .cpu_name = "APM821XX", .cpu_features = CPU_FTRS_44X, -- cgit v1.2.2 From f5339277eb8d3aed37f12a27988366f68ab68930 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 15 Mar 2012 18:18:00 +0000 Subject: powerpc: Remove FW_FEATURE ISERIES from arch code This is no longer selectable, so just remove all the dependent code.
Signed-off-by: Stephen Rothwell Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/irq.c | 14 ----- arch/powerpc/kernel/isa-bridge.c | 3 -- arch/powerpc/kernel/lparcfg.c | 108 +++------------------------------------ arch/powerpc/kernel/paca.c | 12 ++--- arch/powerpc/kernel/pci-common.c | 15 ------ arch/powerpc/kernel/sysfs.c | 7 +-- arch/powerpc/kernel/time.c | 108 ++------------------------------------- 7 files changed, 15 insertions(+), 252 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index eb804e15b29b..45b367c8d8b8 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -211,11 +211,6 @@ notrace void arch_local_irq_restore(unsigned long en) * External interrupt events on non-iseries will have caused * interrupts to be hard-disabled, so there is no problem, we * cannot have preempted. - * - * That leaves us with EEs on iSeries or decrementer interrupts, - * which I decided to safely ignore. The preemption would have - * itself been the result of an interrupt, upon which return we - * will have checked for pending events on the old CPU. 
*/ irq_happened = get_irq_happened(); if (!irq_happened) @@ -458,15 +453,6 @@ void do_IRQ(struct pt_regs *regs) irq_exit(); set_irq_regs(old_regs); -#ifdef CONFIG_PPC_ISERIES - if (firmware_has_feature(FW_FEATURE_ISERIES) && - get_lppaca()->int_dword.fields.decr_int) { - get_lppaca()->int_dword.fields.decr_int = 0; - /* Signal a fake decrementer interrupt */ - timer_interrupt(regs); - } -#endif - trace_irq_exit(regs); } diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 479752901ec6..d45ec58703ce 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -29,7 +29,6 @@ #include #include #include -#include unsigned long isa_io_base; /* NULL if no ISA bus */ EXPORT_SYMBOL(isa_io_base); @@ -261,8 +260,6 @@ static struct notifier_block isa_bridge_notifier = { */ static int __init isa_bridge_init(void) { - if (firmware_has_feature(FW_FEATURE_ISERIES)) - return 0; bus_register_notifier(&pci_bus_type, &isa_bridge_notifier); return 0; } diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c index 578f35f18723..ac12bd80ad95 100644 --- a/arch/powerpc/kernel/lparcfg.c +++ b/arch/powerpc/kernel/lparcfg.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -55,80 +54,14 @@ static unsigned long get_purr(void) int cpu; for_each_possible_cpu(cpu) { - if (firmware_has_feature(FW_FEATURE_ISERIES)) - sum_purr += lppaca_of(cpu).emulated_time_base; - else { - struct cpu_usage *cu; + struct cpu_usage *cu; - cu = &per_cpu(cpu_usage_array, cpu); - sum_purr += cu->current_tb; - } + cu = &per_cpu(cpu_usage_array, cpu); + sum_purr += cu->current_tb; } return sum_purr; } -#ifdef CONFIG_PPC_ISERIES - -/* - * Methods used to fetch LPAR data when running on an iSeries platform. 
- */ -static int iseries_lparcfg_data(struct seq_file *m, void *v) -{ - unsigned long pool_id; - int shared, entitled_capacity, max_entitled_capacity; - int processors, max_processors; - unsigned long purr = get_purr(); - - shared = (int)(local_paca->lppaca_ptr->shared_proc); - - seq_printf(m, "system_active_processors=%d\n", - (int)HvLpConfig_getSystemPhysicalProcessors()); - - seq_printf(m, "system_potential_processors=%d\n", - (int)HvLpConfig_getSystemPhysicalProcessors()); - - processors = (int)HvLpConfig_getPhysicalProcessors(); - seq_printf(m, "partition_active_processors=%d\n", processors); - - max_processors = (int)HvLpConfig_getMaxPhysicalProcessors(); - seq_printf(m, "partition_potential_processors=%d\n", max_processors); - - if (shared) { - entitled_capacity = HvLpConfig_getSharedProcUnits(); - max_entitled_capacity = HvLpConfig_getMaxSharedProcUnits(); - } else { - entitled_capacity = processors * 100; - max_entitled_capacity = max_processors * 100; - } - seq_printf(m, "partition_entitled_capacity=%d\n", entitled_capacity); - - seq_printf(m, "partition_max_entitled_capacity=%d\n", - max_entitled_capacity); - - if (shared) { - pool_id = HvLpConfig_getSharedPoolIndex(); - seq_printf(m, "pool=%d\n", (int)pool_id); - seq_printf(m, "pool_capacity=%d\n", - (int)(HvLpConfig_getNumProcsInSharedPool(pool_id) * - 100)); - seq_printf(m, "purr=%ld\n", purr); - } - - seq_printf(m, "shared_processor_mode=%d\n", shared); - - return 0; -} - -#else /* CONFIG_PPC_ISERIES */ - -static int iseries_lparcfg_data(struct seq_file *m, void *v) -{ - return 0; -} - -#endif /* CONFIG_PPC_ISERIES */ - -#ifdef CONFIG_PPC_PSERIES /* * Methods used to fetch LPAR data when running on a pSeries platform. 
*/ @@ -648,8 +581,7 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf, u8 new_weight, *new_weight_ptr = &new_weight; ssize_t retval; - if (!firmware_has_feature(FW_FEATURE_SPLPAR) || - firmware_has_feature(FW_FEATURE_ISERIES)) + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return -EINVAL; if (count > kbuf_sz) @@ -709,21 +641,6 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf, return retval; } -#else /* CONFIG_PPC_PSERIES */ - -static int pseries_lparcfg_data(struct seq_file *m, void *v) -{ - return 0; -} - -static ssize_t lparcfg_write(struct file *file, const char __user * buf, - size_t count, loff_t * off) -{ - return -EINVAL; -} - -#endif /* CONFIG_PPC_PSERIES */ - static int lparcfg_data(struct seq_file *m, void *v) { struct device_node *rootdn; @@ -738,19 +655,11 @@ static int lparcfg_data(struct seq_file *m, void *v) rootdn = of_find_node_by_path("/"); if (rootdn) { tmp = of_get_property(rootdn, "model", NULL); - if (tmp) { + if (tmp) model = tmp; - /* Skip "IBM," - see platforms/iseries/dt.c */ - if (firmware_has_feature(FW_FEATURE_ISERIES)) - model += 4; - } tmp = of_get_property(rootdn, "system-id", NULL); - if (tmp) { + if (tmp) system_id = tmp; - /* Skip "IBM," - see platforms/iseries/dt.c */ - if (firmware_has_feature(FW_FEATURE_ISERIES)) - system_id += 4; - } lp_index_ptr = of_get_property(rootdn, "ibm,partition-no", NULL); if (lp_index_ptr) @@ -761,8 +670,6 @@ static int lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "system_type=%s\n", model); seq_printf(m, "partition_id=%d\n", (int)lp_index); - if (firmware_has_feature(FW_FEATURE_ISERIES)) - return iseries_lparcfg_data(m, v); return pseries_lparcfg_data(m, v); } @@ -786,8 +693,7 @@ static int __init lparcfg_init(void) umode_t mode = S_IRUSR | S_IRGRP | S_IROTH; /* Allow writing if we have FW_FEATURE_SPLPAR */ - if (firmware_has_feature(FW_FEATURE_SPLPAR) && - !firmware_has_feature(FW_FEATURE_ISERIES)) + if 
(firmware_has_feature(FW_FEATURE_SPLPAR)) mode |= S_IWUSR; ent = proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 41456ff55e14..0bb1f98613ba 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -11,13 +11,10 @@ #include #include -#include #include #include #include #include -#include -#include #include /* This symbol is provided by the linker - let it fill in the paca @@ -30,8 +27,8 @@ extern unsigned long __toc_start; * The structure which the hypervisor knows about - this structure * should not cross a page boundary. The vpa_init/register_vpa call * is now known to fail if the lppaca structure crosses a page - * boundary. The lppaca is also used on legacy iSeries and POWER5 - * pSeries boxes. The lppaca is 640 bytes long, and cannot readily + * boundary. The lppaca is also used on POWER5 pSeries boxes. + * The lppaca is 640 bytes long, and cannot readily * change since the hypervisor knows its layout, so a 1kB alignment * will suffice to ensure that it doesn't cross a page boundary. */ @@ -183,12 +180,9 @@ void __init allocate_pacas(void) /* * We can't take SLB misses on the paca, and we want to access them * in real mode, so allocate them within the RMA and also within - * the first segment. On iSeries they must be within the area mapped - * by the HV, which is HvPagesToMap * HVPAGESIZE bytes. + * the first segment. 
*/ limit = min(0x10000000ULL, ppc64_rma_size); - if (firmware_has_feature(FW_FEATURE_ISERIES)) - limit = min(limit, HvPagesToMap * HVPAGESIZE); paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpu_ids); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index cce98d76e905..d0373bcb7c9d 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -38,7 +38,6 @@ #include #include #include -#include #include static DEFINE_SPINLOCK(hose_spinlock); @@ -219,20 +218,6 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) struct of_irq oirq; unsigned int virq; - /* The current device-tree that iSeries generates from the HV - * PCI informations doesn't contain proper interrupt routing, - * and all the fallback would do is print out crap, so we - * don't attempt to resolve the interrupts here at all, some - * iSeries specific fixup does it. - * - * In the long run, we will hopefully fix the generated device-tree - * instead. - */ -#ifdef CONFIG_PPC_ISERIES - if (firmware_has_feature(FW_FEATURE_ISERIES)) - return -1; -#endif - pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev)); #ifdef DEBUG diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 883e74c0d1b3..0c683d376b1c 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -341,8 +340,7 @@ static void __cpuinit register_cpu_online(unsigned int cpu) int i, nattrs; #ifdef CONFIG_PPC64 - if (!firmware_has_feature(FW_FEATURE_ISERIES) && - cpu_has_feature(CPU_FTR_SMT)) + if (cpu_has_feature(CPU_FTR_SMT)) device_create_file(s, &dev_attr_smt_snooze_delay); #endif @@ -414,8 +412,7 @@ static void unregister_cpu_online(unsigned int cpu) BUG_ON(!c->hotpluggable); #ifdef CONFIG_PPC64 - if (!firmware_has_feature(FW_FEATURE_ISERIES) && - cpu_has_feature(CPU_FTR_SMT)) + if (cpu_has_feature(CPU_FTR_SMT)) device_remove_file(s, 
&dev_attr_smt_snooze_delay); #endif diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index f81c81b92f0e..2c42cd72d0f5 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -17,8 +17,7 @@ * * TODO (not necessarily in this file): * - improve precision and reproducibility of timebase frequency - * measurement at boot time. (for iSeries, we calibrate the timebase - * against the Titan chip's clock.) + * measurement at boot time. * - for astronomical applications: add a new function to get * non ambiguous timestamps even around leap seconds. This needs * a new timestamp format and a good name. @@ -70,10 +69,6 @@ #include #include #include -#ifdef CONFIG_PPC_ISERIES -#include -#include -#endif /* powerpc clocksource/clockevent code */ @@ -117,14 +112,6 @@ static struct clock_event_device decrementer_clockevent = { DEFINE_PER_CPU(u64, decrementers_next_tb); static DEFINE_PER_CPU(struct clock_event_device, decrementers); -#ifdef CONFIG_PPC_ISERIES -static unsigned long __initdata iSeries_recal_titan; -static signed long __initdata iSeries_recal_tb; - -/* Forward declaration is only needed for iSereis compiles */ -static void __init clocksource_init(void); -#endif - #define XSEC_PER_SEC (1024*1024) #ifdef CONFIG_PPC64 @@ -423,74 +410,6 @@ unsigned long profile_pc(struct pt_regs *regs) EXPORT_SYMBOL(profile_pc); #endif -#ifdef CONFIG_PPC_ISERIES - -/* - * This function recalibrates the timebase based on the 49-bit time-of-day - * value in the Titan chip. The Titan is much more accurate than the value - * returned by the service processor for the timebase frequency. 
- */ - -static int __init iSeries_tb_recal(void) -{ - unsigned long titan, tb; - - /* Make sure we only run on iSeries */ - if (!firmware_has_feature(FW_FEATURE_ISERIES)) - return -ENODEV; - - tb = get_tb(); - titan = HvCallXm_loadTod(); - if ( iSeries_recal_titan ) { - unsigned long tb_ticks = tb - iSeries_recal_tb; - unsigned long titan_usec = (titan - iSeries_recal_titan) >> 12; - unsigned long new_tb_ticks_per_sec = (tb_ticks * USEC_PER_SEC)/titan_usec; - unsigned long new_tb_ticks_per_jiffy = - DIV_ROUND_CLOSEST(new_tb_ticks_per_sec, HZ); - long tick_diff = new_tb_ticks_per_jiffy - tb_ticks_per_jiffy; - char sign = '+'; - /* make sure tb_ticks_per_sec and tb_ticks_per_jiffy are consistent */ - new_tb_ticks_per_sec = new_tb_ticks_per_jiffy * HZ; - - if ( tick_diff < 0 ) { - tick_diff = -tick_diff; - sign = '-'; - } - if ( tick_diff ) { - if ( tick_diff < tb_ticks_per_jiffy/25 ) { - printk( "Titan recalibrate: new tb_ticks_per_jiffy = %lu (%c%ld)\n", - new_tb_ticks_per_jiffy, sign, tick_diff ); - tb_ticks_per_jiffy = new_tb_ticks_per_jiffy; - tb_ticks_per_sec = new_tb_ticks_per_sec; - calc_cputime_factors(); - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - setup_cputime_one_jiffy(); - } - else { - printk( "Titan recalibrate: FAILED (difference > 4 percent)\n" - " new tb_ticks_per_jiffy = %lu\n" - " old tb_ticks_per_jiffy = %lu\n", - new_tb_ticks_per_jiffy, tb_ticks_per_jiffy ); - } - } - } - iSeries_recal_titan = titan; - iSeries_recal_tb = tb; - - /* Called here as now we know accurate values for the timebase */ - clocksource_init(); - return 0; -} -late_initcall(iSeries_tb_recal); - -/* Called from platform early init */ -void __init iSeries_time_init_early(void) -{ - iSeries_recal_tb = get_tb(); - iSeries_recal_titan = HvCallXm_loadTod(); -} -#endif /* CONFIG_PPC_ISERIES */ - #ifdef CONFIG_IRQ_WORK /* @@ -546,16 +465,6 @@ void arch_irq_work_raise(void) #endif /* CONFIG_IRQ_WORK */ -/* - * For iSeries shared processors, we have to let the hypervisor - * set 
the hardware decrementer. We set a virtual decrementer - * in the lppaca and call the hypervisor if the virtual - * decrementer is less than the current value in the hardware - * decrementer. (almost always the new decrementer value will - * be greater than the current hardware decementer so the hypervisor - * call will not be needed) - */ - /* * timer_interrupt - gets called when the decrementer overflows, * with interrupts disabled. @@ -599,20 +508,10 @@ void timer_interrupt(struct pt_regs * regs) irq_work_run(); } -#ifdef CONFIG_PPC_ISERIES - if (firmware_has_feature(FW_FEATURE_ISERIES)) - get_lppaca()->int_dword.fields.decr_int = 0; -#endif - *next_tb = ~(u64)0; if (evt->event_handler) evt->event_handler(evt); -#ifdef CONFIG_PPC_ISERIES - if (firmware_has_feature(FW_FEATURE_ISERIES) && hvlpevent_is_pending()) - process_hvlpevents(); -#endif - #ifdef CONFIG_PPC64 /* collect purr register values often, for accurate calculations */ if (firmware_has_feature(FW_FEATURE_SPLPAR)) { @@ -984,9 +883,8 @@ void __init time_init(void) */ start_cpu_decrementer(); - /* Register the clocksource, if we're not running on iSeries */ - if (!firmware_has_feature(FW_FEATURE_ISERIES)) - clocksource_init(); + /* Register the clocksource */ + clocksource_init(); init_decrementer_clockevent(); } -- cgit v1.2.2 From 1b041885ae1d9938440fc2cf6a444b70ec0a86c9 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 15 Mar 2012 18:20:13 +0000 Subject: powerpc: Remove the remaining CONFIG_PPC_ISERIES pieces Signed-off-by: Stephen Rothwell Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/asm-offsets.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index cdd0d264415f..cc492e48ddfa 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -46,9 +46,6 @@ #include #include #endif -#ifdef CONFIG_PPC_ISERIES -#include -#endif #ifdef 
CONFIG_PPC_POWERNV #include #endif @@ -384,17 +381,6 @@ int main(void) DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); #endif -#ifdef CONFIG_PPC_ISERIES - /* the assembler miscalculates the VSID values */ - DEFINE(PAGE_OFFSET_ESID, GET_ESID(PAGE_OFFSET)); - DEFINE(PAGE_OFFSET_VSID, KERNEL_VSID(PAGE_OFFSET)); - DEFINE(VMALLOC_START_ESID, GET_ESID(VMALLOC_START)); - DEFINE(VMALLOC_START_VSID, KERNEL_VSID(VMALLOC_START)); - - /* alpaca */ - DEFINE(ALPACA_SIZE, sizeof(struct alpaca)); -#endif - DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE); DEFINE(PTE_SIZE, sizeof(pte_t)); -- cgit v1.2.2